3162 predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3163 match(Set dst (ReplicateB (LoadB mem)));
3164 format %{ "punpcklbw $dst,$mem\n\t"
3165 "pshuflw $dst,$dst,0x00\n\t"
3166 "punpcklqdq $dst,$dst\t! replicate16B" %}
3167 ins_encode %{
3168 __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3169 __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3170 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3171 %}
3172 ins_pipe( pipe_slow );
3173 %}
3174
// Replicate a byte value from a general-purpose register into all 32 byte
// lanes of a 256-bit (YMM) vector: movd into xmm, widen the byte across the
// low quadword (punpcklbw + pshuflw), fill the low 128 bits (punpcklqdq),
// then mirror the low half into the high half (vinserti128h).
// Matches only when AVX-512 VL+BW is unavailable — presumably an EVEX
// pattern elsewhere covers that case (NOTE(review): confirm).
3175 instruct Repl32B(vecY dst, rRegI src) %{
3176 predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3177 match(Set dst (ReplicateB src));
3178 format %{ "movd $dst,$src\n\t"
3179 "punpcklbw $dst,$dst\n\t"
3180 "pshuflw $dst,$dst,0x00\n\t"
3181 "punpcklqdq $dst,$dst\n\t"
3182 "vinserti128h $dst,$dst,$dst\t! replicate32B" %}
3183 ins_encode %{
3184 __ movdl($dst$$XMMRegister, $src$$Register);
3185 __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3186 __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3187 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3188 __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3189 %}
3190 ins_pipe( pipe_slow );
3191 %}
3192
// Replicate a byte loaded from memory into all 32 byte lanes of a YMM
// vector. Same shuffle sequence as Repl32B, but the initial byte comes in
// via punpcklbw with a memory operand instead of a movd from a register.
3193 instruct Repl32B_mem(vecY dst, memory mem) %{
3194 predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3195 match(Set dst (ReplicateB (LoadB mem)));
3196 format %{ "punpcklbw $dst,$mem\n\t"
3197 "pshuflw $dst,$dst,0x00\n\t"
3198 "punpcklqdq $dst,$dst\n\t"
3199 "vinserti128h $dst,$dst,$dst\t! replicate32B" %}
3200 ins_encode %{
3201 __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3202 __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3203 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3204 __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3205 %}
3206 ins_pipe( pipe_slow );
3207 %}
3208
// Replicate an 8-bit immediate into all 16 byte lanes of a 128-bit (XMM)
// vector: replicate8_imm builds an 8-byte constant with the immediate in
// every byte, movq loads it from the constant table, and punpcklqdq
// duplicates that quadword into the upper half of the register.
3209 instruct Repl16B_imm(vecX dst, immI con) %{
3210 predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3211 match(Set dst (ReplicateB con));
3212 format %{ "movq $dst,[$constantaddress]\n\t"
3213 "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
3214 ins_encode %{
3215 __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3216 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3217 %}
3218 ins_pipe( pipe_slow );
3219 %}
3220
// Replicate an 8-bit immediate into all 32 byte lanes of a YMM vector:
// load a pre-replicated 8-byte constant from the constant table, widen it
// to 128 bits with punpcklqdq, then mirror into the high 128 bits.
// Fix: format string said "lreplicate32B" — stray leading 'l'; every
// sibling pattern spells it "replicate32B".
3221 instruct Repl32B_imm(vecY dst, immI con) %{
3222 predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3223 match(Set dst (ReplicateB con));
3224 format %{ "movq $dst,[$constantaddress]\n\t"
3225 "punpcklqdq $dst,$dst\n\t"
3226 "vinserti128h $dst,$dst,$dst\t! replicate32B($con)" %}
3227 ins_encode %{
3228 __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3229 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3230 __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3231 %}
3232 ins_pipe( pipe_slow );
3233 %}
3234
// Replicate a 16-bit (short) value from a GP register into the 4 short
// lanes of a 64-bit (vecD) vector: movd into xmm, then pshuflw with
// selector 0x00 broadcasts word 0 across the low four words.
3235 instruct Repl4S(vecD dst, rRegI src) %{
3236 predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vlbw());
3237 match(Set dst (ReplicateS src));
3238 format %{ "movd $dst,$src\n\t"
3239 "pshuflw $dst,$dst,0x00\t! replicate4S" %}
3240 ins_encode %{
3241 __ movdl($dst$$XMMRegister, $src$$Register);
3242 __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3243 %}
3244 ins_pipe( pipe_slow );
3245 %}
3246
3247 instruct Repl4S_mem(vecD dst, memory mem) %{
3248 predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3249 match(Set dst (ReplicateS (LoadS mem)));
3250 format %{ "pshuflw $dst,$mem,0x00\t! replicate4S" %}
3281 %}
3282
// Replicate a 16-bit immediate into all 8 short lanes of an XMM vector:
// replicate8_imm (element size 2) builds an 8-byte constant holding four
// copies, movq loads it, punpcklqdq duplicates it into the upper half.
3283 instruct Repl8S_imm(vecX dst, immI con) %{
3284 predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3285 match(Set dst (ReplicateS con));
3286 format %{ "movq $dst,[$constantaddress]\n\t"
3287 "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
3288 ins_encode %{
3289 __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3290 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3291 %}
3292 ins_pipe( pipe_slow );
3293 %}
3294
// Replicate a short from a GP register into all 16 short lanes of a YMM
// vector: broadcast across the low quadword (movd + pshuflw), fill the low
// 128 bits (punpcklqdq), then mirror into the high half (vinserti128h).
3295 instruct Repl16S(vecY dst, rRegI src) %{
3296 predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3297 match(Set dst (ReplicateS src));
3298 format %{ "movd $dst,$src\n\t"
3299 "pshuflw $dst,$dst,0x00\n\t"
3300 "punpcklqdq $dst,$dst\n\t"
3301 "vinserti128h $dst,$dst,$dst\t! replicate16S" %}
3302 ins_encode %{
3303 __ movdl($dst$$XMMRegister, $src$$Register);
3304 __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3305 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3306 __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3307 %}
3308 ins_pipe( pipe_slow );
3309 %}
3310
// Replicate a short loaded from memory into all 16 short lanes of a YMM
// vector; pshuflw broadcasts directly from the memory operand, then the
// quadword is widened to 128 bits and mirrored into the high half.
3311 instruct Repl16S_mem(vecY dst, memory mem) %{
3312 predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3313 match(Set dst (ReplicateS (LoadS mem)));
3314 format %{ "pshuflw $dst,$mem,0x00\n\t"
3315 "punpcklqdq $dst,$dst\n\t"
3316 "vinserti128h $dst,$dst,$dst\t! replicate16S" %}
3317 ins_encode %{
3318 __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3319 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3320 __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3321 %}
3322 ins_pipe( pipe_slow );
3323 %}
3324
// Replicate a 16-bit immediate into all 16 short lanes of a YMM vector:
// constant-table load of a pre-replicated quadword, punpcklqdq to fill the
// low 128 bits, vinserti128h to mirror into the high half.
3325 instruct Repl16S_imm(vecY dst, immI con) %{
3326 predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3327 match(Set dst (ReplicateS con));
3328 format %{ "movq $dst,[$constantaddress]\n\t"
3329 "punpcklqdq $dst,$dst\n\t"
3330 "vinserti128h $dst,$dst,$dst\t! replicate16S($con)" %}
3331 ins_encode %{
3332 __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3333 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3334 __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3335 %}
3336 ins_pipe( pipe_slow );
3337 %}
3338
// Replicate a 32-bit int from a GP register into all 4 int lanes of an XMM
// vector: movd then pshufd 0x00 (broadcast dword 0 to every lane). Note the
// int/long/float patterns gate on !supports_avx512vl() rather than the
// vlbw check used by the byte/short patterns above.
3339 instruct Repl4I(vecX dst, rRegI src) %{
3340 predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3341 match(Set dst (ReplicateI src));
3342 format %{ "movd $dst,$src\n\t"
3343 "pshufd $dst,$dst,0x00\t! replicate4I" %}
3344 ins_encode %{
3345 __ movdl($dst$$XMMRegister, $src$$Register);
3346 __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3347 %}
3348 ins_pipe( pipe_slow );
3349 %}
3350
// Replicate an int loaded from memory into all 4 int lanes of an XMM
// vector with a single pshufd from the memory operand. Requires AVX
// (UseAVX > 0) in the predicate, unlike the register form above.
3351 instruct Repl4I_mem(vecX dst, memory mem) %{
3352 predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3353 match(Set dst (ReplicateI (LoadI mem)));
3354 format %{ "pshufd $dst,$mem,0x00\t! replicate4I" %}
3355 ins_encode %{
3356 __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3357 %}
3358 ins_pipe( pipe_slow );
3359 %}
3360
// Replicate an int from a GP register into all 8 int lanes of a YMM
// vector: broadcast within 128 bits (movd + pshufd), then mirror the low
// half into the high half (vinserti128h).
3361 instruct Repl8I(vecY dst, rRegI src) %{
3362 predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3363 match(Set dst (ReplicateI src));
3364 format %{ "movd $dst,$src\n\t"
3365 "pshufd $dst,$dst,0x00\n\t"
3366 "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
3367 ins_encode %{
3368 __ movdl($dst$$XMMRegister, $src$$Register);
3369 __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3370 __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3371 %}
3372 ins_pipe( pipe_slow );
3373 %}
3374
// Replicate an int loaded from memory into all 8 int lanes of a YMM
// vector: pshufd broadcast from the memory operand, then mirror the low
// 128 bits into the high half.
3375 instruct Repl8I_mem(vecY dst, memory mem) %{
3376 predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3377 match(Set dst (ReplicateI (LoadI mem)));
3378 format %{ "pshufd $dst,$mem,0x00\n\t"
3379 "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
3380 ins_encode %{
3381 __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3382 __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3383 %}
3384 ins_pipe( pipe_slow );
3385 %}
3386
// Replicate a 32-bit immediate into all 4 int lanes of an XMM vector:
// replicate8_imm (element size 4) builds a quadword holding two copies,
// movq loads it, punpcklqdq duplicates it into the upper half.
3387 instruct Repl4I_imm(vecX dst, immI con) %{
3388 predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3389 match(Set dst (ReplicateI con));
3390 format %{ "movq $dst,[$constantaddress]\t! replicate4I($con)\n\t"
3391 "punpcklqdq $dst,$dst" %}
3392 ins_encode %{
3393 __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3394 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3395 %}
3396 ins_pipe( pipe_slow );
3397 %}
3398
// Replicate a 32-bit immediate into all 8 int lanes of a YMM vector:
// constant-table load, punpcklqdq to fill 128 bits, vinserti128h to
// mirror into the high half.
3399 instruct Repl8I_imm(vecY dst, immI con) %{
3400 predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3401 match(Set dst (ReplicateI con));
3402 format %{ "movq $dst,[$constantaddress]\t! replicate8I($con)\n\t"
3403 "punpcklqdq $dst,$dst\n\t"
3404 "vinserti128h $dst,$dst,$dst" %}
3405 ins_encode %{
3406 __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3407 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3408 __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3409 %}
3410 ins_pipe( pipe_slow );
3411 %}
3412
3413 // Long could be loaded into xmm register directly from memory.
// Replicate a long loaded from memory into both quadword lanes of an XMM
// vector: movq loads the low quadword, punpcklqdq duplicates it.
3414 instruct Repl2L_mem(vecX dst, memory mem) %{
3415 predicate(n->as_Vector()->length() == 2 && !VM_Version::supports_avx512vlbw());
3416 match(Set dst (ReplicateL (LoadL mem)));
3417 format %{ "movq $dst,$mem\n\t"
3418 "punpcklqdq $dst,$dst\t! replicate2L" %}
3419 ins_encode %{
3420 __ movq($dst$$XMMRegister, $mem$$Address);
3421 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3422 %}
3423 ins_pipe( pipe_slow );
3424 %}
3425
3426 // Replicate long (8 byte) scalar to be vector
// Two platform variants of Repl4L (replicate a long across all 4 quadword
// lanes of a YMM vector):
//  - LP64: the long fits in one GP register, so a single movdq suffices.
//  - 32-bit: the long arrives as a register pair; the low and high halves
//    are moved into xmm separately and recombined with punpckldq (tmp is
//    a scratch xmm declared via TEMP in the effect clause).
// Both then widen with punpcklqdq and mirror the low 128 bits with
// vinserti128h.
3427 #ifdef _LP64
3428 instruct Repl4L(vecY dst, rRegL src) %{
3429 predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3430 match(Set dst (ReplicateL src));
3431 format %{ "movdq $dst,$src\n\t"
3432 "punpcklqdq $dst,$dst\n\t"
3433 "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
3434 ins_encode %{
3435 __ movdq($dst$$XMMRegister, $src$$Register);
3436 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3437 __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3438 %}
3439 ins_pipe( pipe_slow );
3440 %}
3441 #else // _LP64
3442 instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
3443 predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3444 match(Set dst (ReplicateL src));
3445 effect(TEMP dst, USE src, TEMP tmp);
3446 format %{ "movdl $dst,$src.lo\n\t"
3447 "movdl $tmp,$src.hi\n\t"
3448 "punpckldq $dst,$tmp\n\t"
3449 "punpcklqdq $dst,$dst\n\t"
3450 "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
3451 ins_encode %{
3452 __ movdl($dst$$XMMRegister, $src$$Register);
3453 __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3454 __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3455 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3456 __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3457 %}
3458 ins_pipe( pipe_slow );
3459 %}
3460 #endif // _LP64
3461
// Replicate a 64-bit immediate into all 4 quadword lanes of a YMM vector:
// movq the constant from the constant table (no pre-replication needed —
// the element is already quadword-sized), punpcklqdq, vinserti128h.
3462 instruct Repl4L_imm(vecY dst, immL con) %{
3463 predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3464 match(Set dst (ReplicateL con));
3465 format %{ "movq $dst,[$constantaddress]\n\t"
3466 "punpcklqdq $dst,$dst\n\t"
3467 "vinserti128h $dst,$dst,$dst\t! replicate4L($con)" %}
3468 ins_encode %{
3469 __ movq($dst$$XMMRegister, $constantaddress($con));
3470 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3471 __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3472 %}
3473 ins_pipe( pipe_slow );
3474 %}
3475
// Replicate a long loaded from memory into all 4 quadword lanes of a YMM
// vector: movq load, punpcklqdq within 128 bits, vinserti128h to mirror.
3476 instruct Repl4L_mem(vecY dst, memory mem) %{
3477 predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3478 match(Set dst (ReplicateL (LoadL mem)));
3479 format %{ "movq $dst,$mem\n\t"
3480 "punpcklqdq $dst,$dst\n\t"
3481 "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
3482 ins_encode %{
3483 __ movq($dst$$XMMRegister, $mem$$Address);
3484 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3485 __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3486 %}
3487 ins_pipe( pipe_slow );
3488 %}
3489
// Replicate a float loaded from memory into both float lanes of a 64-bit
// (vecD) vector via a single pshufd broadcast from the memory operand.
3490 instruct Repl2F_mem(vecD dst, memory mem) %{
3491 predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3492 match(Set dst (ReplicateF (LoadF mem)));
3493 format %{ "pshufd $dst,$mem,0x00\t! replicate2F" %}
3494 ins_encode %{
3495 __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3496 %}
3497 ins_pipe( pipe_slow );
3498 %}
3499
// Replicate a float loaded from memory into all 4 float lanes of an XMM
// vector via a single pshufd broadcast from the memory operand.
3500 instruct Repl4F_mem(vecX dst, memory mem) %{
3501 predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3502 match(Set dst (ReplicateF (LoadF mem)));
3503 format %{ "pshufd $dst,$mem,0x00\t! replicate4F" %}
3504 ins_encode %{
3505 __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3506 %}
3507 ins_pipe( pipe_slow );
3508 %}
3509
// Replicate a float (already in an xmm register) into all 8 float lanes of
// a YMM vector: pshufd broadcast within 128 bits, then vinsertf128h (the
// floating-point insert, vs. vinserti128h for integer forms) to mirror.
3510 instruct Repl8F(vecY dst, regF src) %{
3511 predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3512 match(Set dst (ReplicateF src));
3513 format %{ "pshufd $dst,$src,0x00\n\t"
3514 "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
3515 ins_encode %{
3516 __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3517 __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3518 %}
3519 ins_pipe( pipe_slow );
3520 %}
3521
// Replicate a float loaded from memory into all 8 float lanes of a YMM
// vector: pshufd broadcast from memory, then vinsertf128h to mirror the
// low 128 bits into the high half.
3522 instruct Repl8F_mem(vecY dst, memory mem) %{
3523 predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3524 match(Set dst (ReplicateF (LoadF mem)));
3525 format %{ "pshufd $dst,$mem,0x00\n\t"
3526 "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
3527 ins_encode %{
3528 __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3529 __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3530 %}
3531 ins_pipe( pipe_slow );
3532 %}
3533
// Materialize a 2-float all-zeros vector with a self-xor (xorps reg,reg),
// which needs no constant load. Gated on UseAVX < 3 — presumably an
// AVX-512 pattern covers UseAVX >= 3 elsewhere (NOTE(review): confirm).
3534 instruct Repl2F_zero(vecD dst, immF0 zero) %{
3535 predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3536 match(Set dst (ReplicateF zero));
3537 format %{ "xorps $dst,$dst\t! replicate2F zero" %}
3538 ins_encode %{
3539 __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3540 %}
3541 ins_pipe( fpu_reg_reg );
3542 %}
3543
3544 instruct Repl4F_zero(vecX dst, immF0 zero) %{
3545 predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3546 match(Set dst (ReplicateF zero));
3547 format %{ "xorps $dst,$dst\t! replicate4F zero" %}
3548 ins_encode %{
3549 __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3559 int vector_len = 1;
3560 __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3561 %}
3562 ins_pipe( fpu_reg_reg );
3563 %}
3564
// Replicate a double loaded from memory into both double lanes of an XMM
// vector. pshufd selector 0x44 copies dwords {1,0,1,0}, i.e. the low
// quadword into both halves.
3565 instruct Repl2D_mem(vecX dst, memory mem) %{
3566 predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3567 match(Set dst (ReplicateD (LoadD mem)));
3568 format %{ "pshufd $dst,$mem,0x44\t! replicate2D" %}
3569 ins_encode %{
3570 __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3571 %}
3572 ins_pipe( pipe_slow );
3573 %}
3574
// Replicate a double (already in an xmm register) into all 4 double lanes
// of a YMM vector: pshufd 0x44 duplicates the low quadword within 128
// bits, then vinsertf128h mirrors into the high half.
3575 instruct Repl4D(vecY dst, regD src) %{
3576 predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3577 match(Set dst (ReplicateD src));
3578 format %{ "pshufd $dst,$src,0x44\n\t"
3579 "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
3580 ins_encode %{
3581 __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3582 __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3583 %}
3584 ins_pipe( pipe_slow );
3585 %}
3586
// Replicate a double loaded from memory into all 4 double lanes of a YMM
// vector: pshufd 0x44 from memory, then vinsertf128h to mirror.
3587 instruct Repl4D_mem(vecY dst, memory mem) %{
3588 predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3589 match(Set dst (ReplicateD (LoadD mem)));
3590 format %{ "pshufd $dst,$mem,0x44\n\t"
3591 "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
3592 ins_encode %{
3593 __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3594 __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3595 %}
3596 ins_pipe( pipe_slow );
3597 %}
3598
3599 // Replicate double (8 byte) scalar zero to be vector
// Materialize a 2-double all-zeros vector with a self-xor (xorpd reg,reg);
// gated on UseAVX < 3, matching the float zero patterns above.
3600 instruct Repl2D_zero(vecX dst, immD0 zero) %{
3601 predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3602 match(Set dst (ReplicateD zero));
3603 format %{ "xorpd $dst,$dst\t! replicate2D zero" %}
3604 ins_encode %{
3605 __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
3606 %}
3607 ins_pipe( fpu_reg_reg );
3608 %}
3609
3610 instruct Repl4D_zero(vecY dst, immD0 zero) %{
3611 predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3612 match(Set dst (ReplicateD zero));
3613 format %{ "vxorpd $dst,$dst,$dst,vect256\t! replicate4D zero" %}
3614 ins_encode %{
4774 "movd $dst,$tmp2\t! add reduction4I" %}
4775 ins_encode %{
4776 int vector_len = 0;
4777 __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4778 __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4779 __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4780 __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4781 __ movdl($tmp2$$XMMRegister, $src1$$Register);
4782 __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4783 __ movdl($dst$$Register, $tmp2$$XMMRegister);
4784 %}
4785 ins_pipe( pipe_slow );
4786 %}
4787
// Add-reduce 8 ints in a YMM vector plus a scalar accumulator (src1),
// producing a scalar int, for AVX1/AVX2-only CPUs (supports_avxonly()).
// Uses two 256-bit vphaddd horizontal adds, folds the high 128-bit lane
// into the low one, then adds the scalar accumulator.
// NOTE(review): the second vphaddd reads $tmp2 before anything is written
// to it. This appears intentional: vphaddd only deposits $tmp2-derived
// sums into the upper dwords of each lane, and the sequence ultimately
// extracts just dword 0 via movdl, so those lanes are don't-cares — but
// confirm against the instruction semantics before touching this.
4788 instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
4789 predicate(VM_Version::supports_avxonly());
4790 match(Set dst (AddReductionVI src1 src2));
4791 effect(TEMP tmp, TEMP tmp2);
4792 format %{ "vphaddd $tmp,$src2,$src2\n\t"
4793 "vphaddd $tmp,$tmp,$tmp2\n\t"
4794 "vextracti128 $tmp2,$tmp\n\t"
4795 "vpaddd $tmp,$tmp,$tmp2\n\t"
4796 "movd $tmp2,$src1\n\t"
4797 "vpaddd $tmp2,$tmp2,$tmp\n\t"
4798 "movd $dst,$tmp2\t! add reduction8I" %}
4799 ins_encode %{
4800 int vector_len = 1;
4801 __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4802 __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4803 __ vextracti128h($tmp2$$XMMRegister, $tmp$$XMMRegister);
4804 __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4805 __ movdl($tmp2$$XMMRegister, $src1$$Register);
4806 __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4807 __ movdl($dst$$Register, $tmp2$$XMMRegister);
4808 %}
4809 ins_pipe( pipe_slow );
4810 %}
4811
// Add-reduce 8 ints plus scalar accumulator on AVX-512 CPUs (UseAVX > 2).
// Strategy: fold the high 128-bit lane into the low one (vextracti128h +
// vpaddd), then tree-reduce within 128 bits with pshufd shuffles (0xE
// brings dwords 2-3 down, 0x1 brings dword 1 down), add the scalar
// accumulator, and extract the final dword with movdl.
4812 instruct rvadd8I_reduction_reg_evex(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
4813 predicate(UseAVX > 2);
4814 match(Set dst (AddReductionVI src1 src2));
4815 effect(TEMP tmp, TEMP tmp2);
4816 format %{ "vextracti128 $tmp,$src2\n\t"
4817 "vpaddd $tmp,$tmp,$src2\n\t"
4818 "pshufd $tmp2,$tmp,0xE\n\t"
4819 "vpaddd $tmp,$tmp,$tmp2\n\t"
4820 "pshufd $tmp2,$tmp,0x1\n\t"
4821 "vpaddd $tmp,$tmp,$tmp2\n\t"
4822 "movd $tmp2,$src1\n\t"
4823 "vpaddd $tmp2,$tmp,$tmp2\n\t"
4824 "movd $dst,$tmp2\t! add reduction8I" %}
4825 ins_encode %{
4826 int vector_len = 0;
4827 __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
4828 __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
4829 __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
4830 __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4831 __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4832 __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4833 __ movdl($tmp2$$XMMRegister, $src1$$Register);
4834 __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4835 __ movdl($dst$$Register, $tmp2$$XMMRegister);
4836 %}
4837 ins_pipe( pipe_slow );
4838 %}
4839
// Add-reduce 16 ints in a ZMM vector plus a scalar accumulator (src1) to a
// scalar int on AVX-512 (UseAVX > 2): fold the high 256 bits into the low
// (vextracti64x4h + vpaddd at 256-bit width), fold the high 128 bits into
// the low (vextracti128h + vpaddd), tree-reduce within 128 bits with
// pshufd 0xE / 0x1 shuffles, add the scalar accumulator, extract dword 0.
// Fix: the trailing format comment said "! mul reduction16I" — this is an
// ADD reduction (matches AddReductionVI; all sibling add-reduction
// patterns say "add reduction").
4840 instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
4841 predicate(UseAVX > 2);
4842 match(Set dst (AddReductionVI src1 src2));
4843 effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
4844 format %{ "vextracti64x4 $tmp3,$src2,0x1\n\t"
4845 "vpaddd $tmp3,$tmp3,$src2\n\t"
4846 "vextracti128 $tmp,$tmp3\n\t"
4847 "vpaddd $tmp,$tmp,$tmp3\n\t"
4848 "pshufd $tmp2,$tmp,0xE\n\t"
4849 "vpaddd $tmp,$tmp,$tmp2\n\t"
4850 "pshufd $tmp2,$tmp,0x1\n\t"
4851 "vpaddd $tmp,$tmp,$tmp2\n\t"
4852 "movd $tmp2,$src1\n\t"
4853 "vpaddd $tmp2,$tmp,$tmp2\n\t"
4854 "movd $dst,$tmp2\t! add reduction16I" %}
4855 ins_encode %{
4856 __ vextracti64x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 1);
4857 __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
4858 __ vextracti128h($tmp$$XMMRegister, $tmp3$$XMMRegister);
4859 __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
4860 __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
4861 __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4862 __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4863 __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4864 __ movdl($tmp2$$XMMRegister, $src1$$Register);
4865 __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4866 __ movdl($dst$$Register, $tmp2$$XMMRegister);
4867 %}
4868 ins_pipe( pipe_slow );
4869 %}
4870
4871 #ifdef _LP64
// Add-reduce 2 longs in an XMM vector plus a scalar accumulator to a
// scalar long (LP64 only, AVX-512): pshufd 0xE brings the high quadword
// down, one vpaddq combines the lanes, then the scalar accumulator is
// added and the result extracted with movdq.
4872 instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
4873 predicate(UseAVX > 2);
4874 match(Set dst (AddReductionVL src1 src2));
4875 effect(TEMP tmp, TEMP tmp2);
4876 format %{ "pshufd $tmp2,$src2,0xE\n\t"
4877 "vpaddq $tmp,$src2,$tmp2\n\t"
4878 "movdq $tmp2,$src1\n\t"
4879 "vpaddq $tmp2,$tmp,$tmp2\n\t"
4880 "movdq $dst,$tmp2\t! add reduction2L" %}
4881 ins_encode %{
4882 __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4883 __ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
4884 __ movdq($tmp2$$XMMRegister, $src1$$Register);
4885 __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4886 __ movdq($dst$$Register, $tmp2$$XMMRegister);
4887 %}
4888 ins_pipe( pipe_slow );
4889 %}
4890
// Add-reduce 4 longs in a YMM vector plus a scalar accumulator to a scalar
// long (LP64, AVX-512): fold the high 128-bit lane into the low
// (vextracti128h + vpaddq), fold the two quadwords (pshufd 0xE + vpaddq),
// add the scalar accumulator, extract with movdq.
4891 instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
4892 predicate(UseAVX > 2);
4893 match(Set dst (AddReductionVL src1 src2));
4894 effect(TEMP tmp, TEMP tmp2);
4895 format %{ "vextracti128 $tmp,$src2\n\t"
4896 "vpaddq $tmp2,$tmp,$src2\n\t"
4897 "pshufd $tmp,$tmp2,0xE\n\t"
4898 "vpaddq $tmp2,$tmp2,$tmp\n\t"
4899 "movdq $tmp,$src1\n\t"
4900 "vpaddq $tmp2,$tmp2,$tmp\n\t"
4901 "movdq $dst,$tmp2\t! add reduction4L" %}
4902 ins_encode %{
4903 __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
4904 __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
4905 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4906 __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4907 __ movdq($tmp$$XMMRegister, $src1$$Register);
4908 __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4909 __ movdq($dst$$Register, $tmp2$$XMMRegister);
4910 %}
4911 ins_pipe( pipe_slow );
4912 %}
4913
// Add-reduce 8 longs in a ZMM vector plus a scalar accumulator to a scalar
// long (LP64, AVX-512): fold 512->256 (vextracti64x4h + 256-bit vpaddq),
// fold 256->128 (vextracti128h + vpaddq), fold the two quadwords
// (pshufd 0xE + vpaddq), add the scalar accumulator, extract with movdq.
4914 instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
4915 predicate(UseAVX > 2);
4916 match(Set dst (AddReductionVL src1 src2));
4917 effect(TEMP tmp, TEMP tmp2);
4918 format %{ "vextracti64x4 $tmp2,$src2,0x1\n\t"
4919 "vpaddq $tmp2,$tmp2,$src2\n\t"
4920 "vextracti128 $tmp,$tmp2\n\t"
4921 "vpaddq $tmp2,$tmp2,$tmp\n\t"
4922 "pshufd $tmp,$tmp2,0xE\n\t"
4923 "vpaddq $tmp2,$tmp2,$tmp\n\t"
4924 "movdq $tmp,$src1\n\t"
4925 "vpaddq $tmp2,$tmp2,$tmp\n\t"
4926 "movdq $dst,$tmp2\t! add reduction8L" %}
4927 ins_encode %{
4928 __ vextracti64x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 1);
4929 __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
4930 __ vextracti128h($tmp$$XMMRegister, $tmp2$$XMMRegister);
4931 __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4932 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4933 __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4934 __ movdq($tmp$$XMMRegister, $src1$$Register);
4935 __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4936 __ movdq($dst$$Register, $tmp2$$XMMRegister);
4937 %}
4938 ins_pipe( pipe_slow );
4939 %}
4940 #endif
4941
4942 instruct rsadd2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
4943 predicate(UseSSE >= 1 && UseAVX == 0);
4944 match(Set dst (AddReductionVF dst src2));
4945 effect(TEMP dst, TEMP tmp);
4946 format %{ "addss $dst,$src2\n\t"
4947 "pshufd $tmp,$src2,0x01\n\t"
4948 "addss $dst,$tmp\t! add reduction2F" %}
4949 ins_encode %{
4950 __ addss($dst$$XMMRegister, $src2$$XMMRegister);
5009 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5010 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5011 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5012 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5013 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5014 %}
5015 ins_pipe( pipe_slow );
5016 %}
5017
// Add-reduce 8 floats in a YMM vector into the scalar float accumulator
// dst (AVX). Adds the four low-lane elements one at a time via vaddss
// (pshufd selectors 0x01/0x02/0x03 rotate each element into lane 0), then
// extracts the high 128-bit lane (vextractf128h) and repeats. Strictly
// ordered scalar adds preserve the sequential FP summation order.
5018 instruct radd8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
5019 predicate(UseAVX > 0);
5020 match(Set dst (AddReductionVF dst src2));
5021 effect(TEMP tmp, TEMP dst, TEMP tmp2);
5022 format %{ "vaddss $dst,$dst,$src2\n\t"
5023 "pshufd $tmp,$src2,0x01\n\t"
5024 "vaddss $dst,$dst,$tmp\n\t"
5025 "pshufd $tmp,$src2,0x02\n\t"
5026 "vaddss $dst,$dst,$tmp\n\t"
5027 "pshufd $tmp,$src2,0x03\n\t"
5028 "vaddss $dst,$dst,$tmp\n\t"
5029 "vextractf128 $tmp2,$src2\n\t"
5030 "vaddss $dst,$dst,$tmp2\n\t"
5031 "pshufd $tmp,$tmp2,0x01\n\t"
5032 "vaddss $dst,$dst,$tmp\n\t"
5033 "pshufd $tmp,$tmp2,0x02\n\t"
5034 "vaddss $dst,$dst,$tmp\n\t"
5035 "pshufd $tmp,$tmp2,0x03\n\t"
5036 "vaddss $dst,$dst,$tmp\t! add reduction8F" %}
5037 ins_encode %{
5038 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5039 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5040 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5041 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5042 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5043 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5044 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5045 __ vextractf128h($tmp2$$XMMRegister, $src2$$XMMRegister);
5046 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5047 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5048 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5049 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5050 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5051 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5052 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5053 %}
5054 ins_pipe( pipe_slow );
5055 %}
5056
// Add-reduce 16 floats in a ZMM vector into the scalar float accumulator
// dst (AVX-512). Same element-by-element vaddss scheme as radd8F, applied
// to each of the four 128-bit sub-vectors: the low one directly, then
// vextractf32x4h pulls sub-vectors 1, 2, and 3 into tmp2 in turn. The
// strictly ordered scalar adds preserve sequential FP summation order.
5057 instruct radd16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
5058 predicate(UseAVX > 2);
5059 match(Set dst (AddReductionVF dst src2));
5060 effect(TEMP tmp, TEMP dst, TEMP tmp2);
5061 format %{ "vaddss $dst,$dst,$src2\n\t"
5062 "pshufd $tmp,$src2,0x01\n\t"
5063 "vaddss $dst,$dst,$tmp\n\t"
5064 "pshufd $tmp,$src2,0x02\n\t"
5065 "vaddss $dst,$dst,$tmp\n\t"
5066 "pshufd $tmp,$src2,0x03\n\t"
5067 "vaddss $dst,$dst,$tmp\n\t"
5068 "vextractf32x4 $tmp2,$src2, 0x1\n\t"
5069 "vaddss $dst,$dst,$tmp2\n\t"
5070 "pshufd $tmp,$tmp2,0x01\n\t"
5071 "vaddss $dst,$dst,$tmp\n\t"
5072 "pshufd $tmp,$tmp2,0x02\n\t"
5073 "vaddss $dst,$dst,$tmp\n\t"
5074 "pshufd $tmp,$tmp2,0x03\n\t"
5075 "vaddss $dst,$dst,$tmp\n\t"
5076 "vextractf32x4 $tmp2,$src2, 0x2\n\t"
5077 "vaddss $dst,$dst,$tmp2\n\t"
5078 "pshufd $tmp,$tmp2,0x01\n\t"
5079 "vaddss $dst,$dst,$tmp\n\t"
5080 "pshufd $tmp,$tmp2,0x02\n\t"
5081 "vaddss $dst,$dst,$tmp\n\t"
5082 "pshufd $tmp,$tmp2,0x03\n\t"
5083 "vaddss $dst,$dst,$tmp\n\t"
5084 "vextractf32x4 $tmp2,$src2, 0x3\n\t"
5085 "vaddss $dst,$dst,$tmp2\n\t"
5086 "pshufd $tmp,$tmp2,0x01\n\t"
5087 "vaddss $dst,$dst,$tmp\n\t"
5088 "pshufd $tmp,$tmp2,0x02\n\t"
5089 "vaddss $dst,$dst,$tmp\n\t"
5090 "pshufd $tmp,$tmp2,0x03\n\t"
5091 "vaddss $dst,$dst,$tmp\t! add reduction16F" %}
5092 ins_encode %{
5093 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5094 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5095 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5096 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5097 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5098 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5099 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5100 __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5101 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5102 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5103 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5104 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5105 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5106 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5107 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5108 __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5109 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5110 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5111 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5112 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5113 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5114 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5115 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5116 __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5117 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5118 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5119 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5120 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5121 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5122 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5123 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5124 %}
5125 ins_pipe( pipe_slow );
5126 %}
5127
5128 instruct rsadd2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5129 predicate(UseSSE >= 1 && UseAVX == 0);
5130 match(Set dst (AddReductionVD dst src2));
5131 effect(TEMP tmp, TEMP dst);
5132 format %{ "addsd $dst,$src2\n\t"
5133 "pshufd $tmp,$src2,0xE\n\t"
5134 "addsd $dst,$tmp\t! add reduction2D" %}
5135 ins_encode %{
5136 __ addsd($dst$$XMMRegister, $src2$$XMMRegister);
5145 match(Set dst (AddReductionVD dst src2));
5146 effect(TEMP tmp, TEMP dst);
5147 format %{ "vaddsd $dst,$dst,$src2\n\t"
5148 "pshufd $tmp,$src2,0xE\n\t"
5149 "vaddsd $dst,$dst,$tmp\t! add reduction2D" %}
5150 ins_encode %{
5151 __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5152 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5153 __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5154 %}
5155 ins_pipe( pipe_slow );
5156 %}
5157
// Add-reduce 4 doubles in a YMM vector into the scalar double accumulator
// dst (AVX): add low-lane element 0, bring element 1 down with pshufd 0xE
// and add it, then extract the high 128-bit sub-vector (vextractf32x4h,
// index 0x1) and repeat. Ordered vaddsd keeps sequential FP semantics.
5158 instruct rvadd4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
5159 predicate(UseAVX > 0);
5160 match(Set dst (AddReductionVD dst src2));
5161 effect(TEMP tmp, TEMP dst, TEMP tmp2);
5162 format %{ "vaddsd $dst,$dst,$src2\n\t"
5163 "pshufd $tmp,$src2,0xE\n\t"
5164 "vaddsd $dst,$dst,$tmp\n\t"
5165 "vextractf32x4h $tmp2,$src2, 0x1\n\t"
5166 "vaddsd $dst,$dst,$tmp2\n\t"
5167 "pshufd $tmp,$tmp2,0xE\n\t"
5168 "vaddsd $dst,$dst,$tmp\t! add reduction4D" %}
5169 ins_encode %{
5170 __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5171 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5172 __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5173 __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5174 __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5175 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5176 __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5177 %}
5178 ins_pipe( pipe_slow );
5179 %}
5180
// Add-reduce 8 doubles in a ZMM vector into the scalar double accumulator
// dst (AVX-512). For each of the four 128-bit sub-vectors (low one
// directly, then vextractf32x4h with indices 0x1/0x2/0x3): add element 0,
// shuffle element 1 down with pshufd 0xE, add it. Ordered vaddsd keeps
// sequential FP summation semantics.
5181 instruct rvadd8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
5182 predicate(UseAVX > 2);
5183 match(Set dst (AddReductionVD dst src2));
5184 effect(TEMP tmp, TEMP dst, TEMP tmp2);
5185 format %{ "vaddsd $dst,$dst,$src2\n\t"
5186 "pshufd $tmp,$src2,0xE\n\t"
5187 "vaddsd $dst,$dst,$tmp\n\t"
5188 "vextractf32x4 $tmp2,$src2, 0x1\n\t"
5189 "vaddsd $dst,$dst,$tmp2\n\t"
5190 "pshufd $tmp,$tmp2,0xE\n\t"
5191 "vaddsd $dst,$dst,$tmp\n\t"
5192 "vextractf32x4 $tmp2,$src2, 0x2\n\t"
5193 "vaddsd $dst,$dst,$tmp2\n\t"
5194 "pshufd $tmp,$tmp2,0xE\n\t"
5195 "vaddsd $dst,$dst,$tmp\n\t"
5196 "vextractf32x4 $tmp2,$src2, 0x3\n\t"
5197 "vaddsd $dst,$dst,$tmp2\n\t"
5198 "pshufd $tmp,$tmp2,0xE\n\t"
5199 "vaddsd $dst,$dst,$tmp\t! add reduction8D" %}
5200 ins_encode %{
5201 __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5202 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5203 __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5204 __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5205 __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5206 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5207 __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5208 __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5209 __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5210 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5211 __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5212 __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5213 __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5214 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5215 __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5216 %}
5217 ins_pipe( pipe_slow );
5218 %}
5219
5220 instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
5221 predicate(UseSSE > 3 && UseAVX == 0);
5222 match(Set dst (MulReductionVI src1 src2));
5223 effect(TEMP tmp, TEMP tmp2);
5224 format %{ "pshufd $tmp2,$src2,0x1\n\t"
5225 "pmulld $tmp2,$src2\n\t"
5226 "movd $tmp,$src1\n\t"
5227 "pmulld $tmp2,$tmp\n\t"
5228 "movd $dst,$tmp2\t! mul reduction2I" %}
5229 ins_encode %{
5230 __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5231 __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5232 __ movdl($tmp$$XMMRegister, $src1$$Register);
5290 "movd $tmp2,$src1\n\t"
5291 "vpmulld $tmp2,$tmp,$tmp2\n\t"
5292 "movd $dst,$tmp2\t! mul reduction4I" %}
5293 ins_encode %{
5294 int vector_len = 0;
5295 __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5296 __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5297 __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5298 __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5299 __ movdl($tmp2$$XMMRegister, $src1$$Register);
5300 __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5301 __ movdl($dst$$Register, $tmp2$$XMMRegister);
5302 %}
5303 ins_pipe( pipe_slow );
5304 %}
5305
// rvmul8I: dst = src1 * (product of the 8 ints in vecY src2).
// Folds 256 -> 128 bits (vextracti128h + vpmulld), then pairwise within the
// xmm (pshufd 0xE, then 0x1), multiplies in the scalar src1, and moves the
// low dword out to the GP result. Integer multiply is associative, so the
// tree-shaped fold is safe here (unlike the FP reductions above).
// NOTE(review): predicate admits UseAVX == 1 but vextracti128/vpmulld are
// AVX2 instructions -- presumably the matcher only emits MulReductionVI with
// AVX2 support; confirm.
5306 instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
5307 predicate(UseAVX > 0);
5308 match(Set dst (MulReductionVI src1 src2));
5309 effect(TEMP tmp, TEMP tmp2);
5310 format %{ "vextracti128 $tmp,$src2\n\t"
5311 "vpmulld $tmp,$tmp,$src2\n\t"
5312 "pshufd $tmp2,$tmp,0xE\n\t"
5313 "vpmulld $tmp,$tmp,$tmp2\n\t"
5314 "pshufd $tmp2,$tmp,0x1\n\t"
5315 "vpmulld $tmp,$tmp,$tmp2\n\t"
5316 "movd $tmp2,$src1\n\t"
5317 "vpmulld $tmp2,$tmp,$tmp2\n\t"
5318 "movd $dst,$tmp2\t! mul reduction8I" %}
5319 ins_encode %{
5320 int vector_len = 0;
5321 __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
5322 __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5323 __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5324 __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5325 __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5326 __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5327 __ movdl($tmp2$$XMMRegister, $src1$$Register);
5328 __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5329 __ movdl($dst$$Register, $tmp2$$XMMRegister);
5330 %}
5331 ins_pipe( pipe_slow );
5332 %}
5333
// rvmul16I: dst = src1 * (product of the 16 ints in vecZ src2), AVX-512.
// Folds 512 -> 256 bits (vextracti64x4h + 256-bit vpmulld), then 256 -> 128
// (vextracti128h + 128-bit vpmulld), then pairwise within the xmm, multiplies
// in scalar src1, and moves the low dword to the GP result.
// Fix: the format previously printed "vpmulld $tmp,$tmp,$src2" for the third
// multiply, but the encoder multiplies $tmp by $tmp3 (line 5353); the
// disassembly comment now matches the emitted code.
5334 instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
5335 predicate(UseAVX > 2);
5336 match(Set dst (MulReductionVI src1 src2));
5337 effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5338 format %{ "vextracti64x4 $tmp3,$src2,0x1\n\t"
5339 "vpmulld $tmp3,$tmp3,$src2\n\t"
5340 "vextracti128 $tmp,$tmp3\n\t"
5341 "vpmulld $tmp,$tmp,$tmp3\n\t"
5342 "pshufd $tmp2,$tmp,0xE\n\t"
5343 "vpmulld $tmp,$tmp,$tmp2\n\t"
5344 "pshufd $tmp2,$tmp,0x1\n\t"
5345 "vpmulld $tmp,$tmp,$tmp2\n\t"
5346 "movd $tmp2,$src1\n\t"
5347 "vpmulld $tmp2,$tmp,$tmp2\n\t"
5348 "movd $dst,$tmp2\t! mul reduction16I" %}
5349 ins_encode %{
5350 __ vextracti64x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5351 __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5352 __ vextracti128h($tmp$$XMMRegister, $tmp3$$XMMRegister);
5353 __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5354 __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5355 __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5356 __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5357 __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5358 __ movdl($tmp2$$XMMRegister, $src1$$Register);
5359 __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5360 __ movdl($dst$$Register, $tmp2$$XMMRegister);
5361 %}
5362 ins_pipe( pipe_slow );
5363 %}
5364
// --- Long multiply-reductions (MulReductionVL), 64-bit VMs only ---
// All three require AVX-512DQ for vpmullq (packed 64x64->64 multiply).
5365 #ifdef _LP64
// rvmul2L: dst = src1 * src2[0] * src2[1] (vecX). pshufd 0xE moves the high
// qword down; the final movdq transfers the low qword to the GP register.
5366 instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
5367 predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5368 match(Set dst (MulReductionVL src1 src2));
5369 effect(TEMP tmp, TEMP tmp2);
5370 format %{ "pshufd $tmp2,$src2,0xE\n\t"
5371 "vpmullq $tmp,$src2,$tmp2\n\t"
5372 "movdq $tmp2,$src1\n\t"
5373 "vpmullq $tmp2,$tmp,$tmp2\n\t"
5374 "movdq $dst,$tmp2\t! mul reduction2L" %}
5375 ins_encode %{
5376 __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5377 __ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5378 __ movdq($tmp2$$XMMRegister, $src1$$Register);
5379 __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5380 __ movdq($dst$$Register, $tmp2$$XMMRegister);
5381 %}
5382 ins_pipe( pipe_slow );
5383 %}
5384
// rvmul4L: fold vecY 256 -> 128 bits first (vextracti128h), then as rvmul2L.
5385 instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
5386 predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5387 match(Set dst (MulReductionVL src1 src2));
5388 effect(TEMP tmp, TEMP tmp2);
5389 format %{ "vextracti128 $tmp,$src2\n\t"
5390 "vpmullq $tmp2,$tmp,$src2\n\t"
5391 "pshufd $tmp,$tmp2,0xE\n\t"
5392 "vpmullq $tmp2,$tmp2,$tmp\n\t"
5393 "movdq $tmp,$src1\n\t"
5394 "vpmullq $tmp2,$tmp2,$tmp\n\t"
5395 "movdq $dst,$tmp2\t! mul reduction4L" %}
5396 ins_encode %{
5397 __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
5398 __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5399 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5400 __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5401 __ movdq($tmp$$XMMRegister, $src1$$Register);
5402 __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5403 __ movdq($dst$$Register, $tmp2$$XMMRegister);
5404 %}
5405 ins_pipe( pipe_slow );
5406 %}
5407
// rvmul8L: fold vecZ 512 -> 256 (vextracti64x4h + 256-bit vpmullq), then
// 256 -> 128, then as rvmul2L.
5408 instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
5409 predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5410 match(Set dst (MulReductionVL src1 src2));
5411 effect(TEMP tmp, TEMP tmp2);
5412 format %{ "vextracti64x4 $tmp2,$src2,0x1\n\t"
5413 "vpmullq $tmp2,$tmp2,$src2\n\t"
5414 "vextracti128 $tmp,$tmp2\n\t"
5415 "vpmullq $tmp2,$tmp2,$tmp\n\t"
5416 "pshufd $tmp,$tmp2,0xE\n\t"
5417 "vpmullq $tmp2,$tmp2,$tmp\n\t"
5418 "movdq $tmp,$src1\n\t"
5419 "vpmullq $tmp2,$tmp2,$tmp\n\t"
5420 "movdq $dst,$tmp2\t! mul reduction8L" %}
5421 ins_encode %{
5422 __ vextracti64x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5423 __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5424 __ vextracti128h($tmp$$XMMRegister, $tmp2$$XMMRegister);
5425 __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5426 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5427 __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5428 __ movdq($tmp$$XMMRegister, $src1$$Register);
5429 __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5430 __ movdq($dst$$Register, $tmp2$$XMMRegister);
5431 %}
5432 ins_pipe( pipe_slow );
5433 %}
5434 #endif
5435
5436 instruct rsmul2F_reduction(regF dst, vecD src2, regF tmp) %{
5437 predicate(UseSSE >= 1 && UseAVX == 0);
5438 match(Set dst (MulReductionVF dst src2));
5439 effect(TEMP dst, TEMP tmp);
5440 format %{ "mulss $dst,$src2\n\t"
5441 "pshufd $tmp,$src2,0x01\n\t"
5442 "mulss $dst,$tmp\t! mul reduction2F" %}
5443 ins_encode %{
5444 __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5503 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5504 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5505 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5506 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5507 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5508 %}
5509 ins_pipe( pipe_slow );
5510 %}
5511
// rvmul8F: dst *= src2[0..7] for a 256-bit (vecY) float vector.
// Lanes are multiplied strictly in order with scalar vmulss (FP multiply is
// kept sequential); pshufd 0x01/0x02/0x03 rotate each float of the current
// 128-bit chunk into lane 0, and vextractf128h brings down the upper half.
5512 instruct rvmul8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
5513 predicate(UseAVX > 0);
5514 match(Set dst (MulReductionVF dst src2));
5515 effect(TEMP tmp, TEMP dst, TEMP tmp2);
5516 format %{ "vmulss $dst,$dst,$src2\n\t"
5517 "pshufd $tmp,$src2,0x01\n\t"
5518 "vmulss $dst,$dst,$tmp\n\t"
5519 "pshufd $tmp,$src2,0x02\n\t"
5520 "vmulss $dst,$dst,$tmp\n\t"
5521 "pshufd $tmp,$src2,0x03\n\t"
5522 "vmulss $dst,$dst,$tmp\n\t"
5523 "vextractf128 $tmp2,$src2\n\t"
5524 "vmulss $dst,$dst,$tmp2\n\t"
5525 "pshufd $tmp,$tmp2,0x01\n\t"
5526 "vmulss $dst,$dst,$tmp\n\t"
5527 "pshufd $tmp,$tmp2,0x02\n\t"
5528 "vmulss $dst,$dst,$tmp\n\t"
5529 "pshufd $tmp,$tmp2,0x03\n\t"
5530 "vmulss $dst,$dst,$tmp\t! mul reduction8F" %}
5531 ins_encode %{
5532 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5533 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5534 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5535 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5536 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5537 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5538 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5539 __ vextractf128h($tmp2$$XMMRegister, $src2$$XMMRegister);
5540 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5541 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5542 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5543 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5544 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5545 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5546 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5547 %}
5548 ins_pipe( pipe_slow );
5549 %}
5550
// rvmul16F: dst *= src2[0..15] for a 512-bit (vecZ) float vector, AVX-512.
// Same sequential scalar-vmulss pattern as rvmul8F, repeated for the four
// 128-bit lanes; vextractf32x4h pulls lane 0x1/0x2/0x3 into tmp2 before its
// four floats are rotated down with pshufd 0x01/0x02/0x03.
5551 instruct rvmul16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
5552 predicate(UseAVX > 2);
5553 match(Set dst (MulReductionVF dst src2));
5554 effect(TEMP tmp, TEMP dst, TEMP tmp2);
5555 format %{ "vmulss $dst,$dst,$src2\n\t"
5556 "pshufd $tmp,$src2,0x01\n\t"
5557 "vmulss $dst,$dst,$tmp\n\t"
5558 "pshufd $tmp,$src2,0x02\n\t"
5559 "vmulss $dst,$dst,$tmp\n\t"
5560 "pshufd $tmp,$src2,0x03\n\t"
5561 "vmulss $dst,$dst,$tmp\n\t"
5562 "vextractf32x4 $tmp2,$src2, 0x1\n\t"
5563 "vmulss $dst,$dst,$tmp2\n\t"
5564 "pshufd $tmp,$tmp2,0x01\n\t"
5565 "vmulss $dst,$dst,$tmp\n\t"
5566 "pshufd $tmp,$tmp2,0x02\n\t"
5567 "vmulss $dst,$dst,$tmp\n\t"
5568 "pshufd $tmp,$tmp2,0x03\n\t"
5569 "vmulss $dst,$dst,$tmp\n\t"
5570 "vextractf32x4 $tmp2,$src2, 0x2\n\t"
5571 "vmulss $dst,$dst,$tmp2\n\t"
5572 "pshufd $tmp,$tmp2,0x01\n\t"
5573 "vmulss $dst,$dst,$tmp\n\t"
5574 "pshufd $tmp,$tmp2,0x02\n\t"
5575 "vmulss $dst,$dst,$tmp\n\t"
5576 "pshufd $tmp,$tmp2,0x03\n\t"
5577 "vmulss $dst,$dst,$tmp\n\t"
5578 "vextractf32x4 $tmp2,$src2, 0x3\n\t"
5579 "vmulss $dst,$dst,$tmp2\n\t"
5580 "pshufd $tmp,$tmp2,0x01\n\t"
5581 "vmulss $dst,$dst,$tmp\n\t"
5582 "pshufd $tmp,$tmp2,0x02\n\t"
5583 "vmulss $dst,$dst,$tmp\n\t"
5584 "pshufd $tmp,$tmp2,0x03\n\t"
5585 "vmulss $dst,$dst,$tmp\t! mul reduction16F" %}
5586 ins_encode %{
5587 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5588 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5589 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5590 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5591 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5592 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5593 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5594 __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5595 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5596 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5597 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5598 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5599 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5600 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5601 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5602 __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5603 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5604 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5605 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5606 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5607 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5608 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5609 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5610 __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5611 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5612 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5613 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5614 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5615 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5616 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5617 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5618 %}
5619 ins_pipe( pipe_slow );
5620 %}
5621
5622 instruct rsmul2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5623 predicate(UseSSE >= 1 && UseAVX == 0);
5624 match(Set dst (MulReductionVD dst src2));
5625 effect(TEMP dst, TEMP tmp);
5626 format %{ "mulsd $dst,$src2\n\t"
5627 "pshufd $tmp,$src2,0xE\n\t"
5628 "mulsd $dst,$tmp\t! mul reduction2D" %}
5629 ins_encode %{
5630 __ mulsd($dst$$XMMRegister, $src2$$XMMRegister);
5639 match(Set dst (MulReductionVD dst src2));
5640 effect(TEMP tmp, TEMP dst);
5641 format %{ "vmulsd $dst,$dst,$src2\n\t"
5642 "pshufd $tmp,$src2,0xE\n\t"
5643 "vmulsd $dst,$dst,$tmp\t! mul reduction2D" %}
5644 ins_encode %{
5645 __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5646 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5647 __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5648 %}
5649 ins_pipe( pipe_slow );
5650 %}
5651
// rvmul4D: dst *= src2[0..3] for a 256-bit (vecY) double vector.
// Sequential scalar vmulsd per lane: pshufd 0xE moves the high double of a
// 128-bit chunk down, vextractf128h brings down the upper half of the ymm.
5652 instruct rvmul4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
5653 predicate(UseAVX > 0);
5654 match(Set dst (MulReductionVD dst src2));
5655 effect(TEMP tmp, TEMP dst, TEMP tmp2);
5656 format %{ "vmulsd $dst,$dst,$src2\n\t"
5657 "pshufd $tmp,$src2,0xE\n\t"
5658 "vmulsd $dst,$dst,$tmp\n\t"
5659 "vextractf128 $tmp2,$src2\n\t"
5660 "vmulsd $dst,$dst,$tmp2\n\t"
5661 "pshufd $tmp,$tmp2,0xE\n\t"
5662 "vmulsd $dst,$dst,$tmp\t! mul reduction4D" %}
5663 ins_encode %{
5664 __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5665 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5666 __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5667 __ vextractf128h($tmp2$$XMMRegister, $src2$$XMMRegister);
5668 __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5669 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5670 __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5671 %}
5672 ins_pipe( pipe_slow );
5673 %}
5674
// rvmul8D: dst *= src2[0..7] for a 512-bit (vecZ) double vector, AVX-512.
// Sequential scalar vmulsd per lane: each upper 128-bit lane (0x1..0x3) is
// extracted into tmp2, then pshufd 0xE moves its high double down.
// Fix: the second pshufd in the format printed "$tmp,$src2,0xE", but the
// encoder (line 5700) shuffles $tmp2 -- and the sibling rvadd8D format prints
// $tmp2 at the same position; the disassembly comment now matches the code.
5675 instruct rvmul8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
5676 predicate(UseAVX > 2);
5677 match(Set dst (MulReductionVD dst src2));
5678 effect(TEMP tmp, TEMP dst, TEMP tmp2);
5679 format %{ "vmulsd $dst,$dst,$src2\n\t"
5680 "pshufd $tmp,$src2,0xE\n\t"
5681 "vmulsd $dst,$dst,$tmp\n\t"
5682 "vextractf32x4 $tmp2,$src2, 0x1\n\t"
5683 "vmulsd $dst,$dst,$tmp2\n\t"
5684 "pshufd $tmp,$tmp2,0xE\n\t"
5685 "vmulsd $dst,$dst,$tmp\n\t"
5686 "vextractf32x4 $tmp2,$src2, 0x2\n\t"
5687 "vmulsd $dst,$dst,$tmp2\n\t"
5688 "pshufd $tmp,$tmp2,0xE\n\t"
5689 "vmulsd $dst,$dst,$tmp\n\t"
5690 "vextractf32x4 $tmp2,$src2, 0x3\n\t"
5691 "vmulsd $dst,$dst,$tmp2\n\t"
5692 "pshufd $tmp,$tmp2,0xE\n\t"
5693 "vmulsd $dst,$dst,$tmp\t! mul reduction8D" %}
5694 ins_encode %{
5695 __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5696 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5697 __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5698 __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5699 __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5700 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5701 __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5702 __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5703 __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5704 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5705 __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5706 __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5707 __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5708 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5709 __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5710 %}
5711 ins_pipe( pipe_slow );
5712 %}
5713
5714 // ====================VECTOR ARITHMETIC=======================================
5715
5716 // --------------------------------- ADD --------------------------------------
5717
5718 // Bytes vector add
5719 instruct vadd4B(vecS dst, vecS src) %{
5720 predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
5721 match(Set dst (AddVB dst src));
5722 format %{ "paddb $dst,$src\t! add packed4B" %}
5723 ins_encode %{
5724 __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5725 %}
5726 ins_pipe( pipe_slow );
|
3162 predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3163 match(Set dst (ReplicateB (LoadB mem)));
3164 format %{ "punpcklbw $dst,$mem\n\t"
3165 "pshuflw $dst,$dst,0x00\n\t"
3166 "punpcklqdq $dst,$dst\t! replicate16B" %}
3167 ins_encode %{
3168 __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3169 __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3170 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3171 %}
3172 ins_pipe( pipe_slow );
3173 %}
3174
// Repl32B: splat a byte from a GP register into all 32 lanes of a vecY.
// Build a 16-byte splat in the low xmm (movd + punpcklbw doubles the byte,
// pshuflw spreads it over the low qword, punpcklqdq fills 16 bytes), then
// vinserti128_high copies it into the upper 128 bits.
3175 instruct Repl32B(vecY dst, rRegI src) %{
3176 predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3177 match(Set dst (ReplicateB src));
3178 format %{ "movd $dst,$src\n\t"
3179 "punpcklbw $dst,$dst\n\t"
3180 "pshuflw $dst,$dst,0x00\n\t"
3181 "punpcklqdq $dst,$dst\n\t"
3182 "vinserti128_high $dst,$dst\t! replicate32B" %}
3183 ins_encode %{
3184 __ movdl($dst$$XMMRegister, $src$$Register);
3185 __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3186 __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3187 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3188 __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3189 %}
3190 ins_pipe( pipe_slow );
3191 %}
3192
// Repl32B_mem: same splat but the byte comes from memory; punpcklbw with a
// memory operand loads and interleaves it in one step.
3193 instruct Repl32B_mem(vecY dst, memory mem) %{
3194 predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3195 match(Set dst (ReplicateB (LoadB mem)));
3196 format %{ "punpcklbw $dst,$mem\n\t"
3197 "pshuflw $dst,$dst,0x00\n\t"
3198 "punpcklqdq $dst,$dst\n\t"
3199 "vinserti128_high $dst,$dst\t! replicate32B" %}
3200 ins_encode %{
3201 __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3202 __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3203 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3204 __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3205 %}
3206 ins_pipe( pipe_slow );
3207 %}
3208
// Repl16B_imm: splat an 8-bit immediate into 16 lanes. replicate8_imm builds
// the 8-byte repeated pattern in the constant table; movq loads it and
// punpcklqdq doubles it to fill the xmm.
3209 instruct Repl16B_imm(vecX dst, immI con) %{
3210 predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3211 match(Set dst (ReplicateB con));
3212 format %{ "movq $dst,[$constantaddress]\n\t"
3213 "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
3214 ins_encode %{
3215 __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3216 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3217 %}
3218 ins_pipe( pipe_slow );
3219 %}
3220
// Repl32B_imm: splat an 8-bit immediate into all 32 lanes of a vecY.
// replicate8_imm builds the repeated byte pattern in the constant table;
// movq + punpcklqdq fill the low 128 bits, vinserti128_high the upper half.
// Fix: the format comment read "lreplicate32B" (stray 'l'); siblings print
// "replicate16B"/"replicate32B".
3221 instruct Repl32B_imm(vecY dst, immI con) %{
3222 predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3223 match(Set dst (ReplicateB con));
3224 format %{ "movq $dst,[$constantaddress]\n\t"
3225 "punpcklqdq $dst,$dst\n\t"
3226 "vinserti128_high $dst,$dst\t! replicate32B($con)" %}
3227 ins_encode %{
3228 __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3229 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3230 __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3231 %}
3232 ins_pipe( pipe_slow );
3233 %}
3234
// Repl4S: splat a 16-bit value from a GP register into the 4 short lanes of
// a vecD; pshuflw 0x00 copies word 0 across the low quadword.
3235 instruct Repl4S(vecD dst, rRegI src) %{
3236 predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vlbw());
3237 match(Set dst (ReplicateS src));
3238 format %{ "movd $dst,$src\n\t"
3239 "pshuflw $dst,$dst,0x00\t! replicate4S" %}
3240 ins_encode %{
3241 __ movdl($dst$$XMMRegister, $src$$Register);
3242 __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3243 %}
3244 ins_pipe( pipe_slow );
3245 %}
3246
3247 instruct Repl4S_mem(vecD dst, memory mem) %{
3248 predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3249 match(Set dst (ReplicateS (LoadS mem)));
3250 format %{ "pshuflw $dst,$mem,0x00\t! replicate4S" %}
3281 %}
3282
// Repl8S_imm: splat a 16-bit immediate into 8 lanes; replicate8_imm builds
// the repeated 8-byte pattern, punpcklqdq doubles it to fill the xmm.
3283 instruct Repl8S_imm(vecX dst, immI con) %{
3284 predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3285 match(Set dst (ReplicateS con));
3286 format %{ "movq $dst,[$constantaddress]\n\t"
3287 "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
3288 ins_encode %{
3289 __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3290 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3291 %}
3292 ins_pipe( pipe_slow );
3293 %}
3294
// Repl16S: splat a short from a GP register into all 16 lanes of a vecY:
// build the 128-bit splat, then mirror it into the upper half.
3295 instruct Repl16S(vecY dst, rRegI src) %{
3296 predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3297 match(Set dst (ReplicateS src));
3298 format %{ "movd $dst,$src\n\t"
3299 "pshuflw $dst,$dst,0x00\n\t"
3300 "punpcklqdq $dst,$dst\n\t"
3301 "vinserti128_high $dst,$dst\t! replicate16S" %}
3302 ins_encode %{
3303 __ movdl($dst$$XMMRegister, $src$$Register);
3304 __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3305 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3306 __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3307 %}
3308 ins_pipe( pipe_slow );
3309 %}
3310
// Repl16S_mem: same splat with the short loaded directly by pshuflw's
// memory operand.
3311 instruct Repl16S_mem(vecY dst, memory mem) %{
3312 predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3313 match(Set dst (ReplicateS (LoadS mem)));
3314 format %{ "pshuflw $dst,$mem,0x00\n\t"
3315 "punpcklqdq $dst,$dst\n\t"
3316 "vinserti128_high $dst,$dst\t! replicate16S" %}
3317 ins_encode %{
3318 __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3319 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3320 __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3321 %}
3322 ins_pipe( pipe_slow );
3323 %}
3324
// Repl16S_imm: splat a 16-bit immediate into all 16 lanes of a vecY via the
// constant table, then mirror the low 128 bits into the upper half.
3325 instruct Repl16S_imm(vecY dst, immI con) %{
3326 predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3327 match(Set dst (ReplicateS con));
3328 format %{ "movq $dst,[$constantaddress]\n\t"
3329 "punpcklqdq $dst,$dst\n\t"
3330 "vinserti128_high $dst,$dst\t! replicate16S($con)" %}
3331 ins_encode %{
3332 __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3333 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3334 __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3335 %}
3336 ins_pipe( pipe_slow );
3337 %}
3338
// --- Int replications (non-avx512vl paths) ---
// Repl4I: splat an int from a GP register into the 4 lanes of a vecX.
3339 instruct Repl4I(vecX dst, rRegI src) %{
3340 predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3341 match(Set dst (ReplicateI src));
3342 format %{ "movd $dst,$src\n\t"
3343 "pshufd $dst,$dst,0x00\t! replicate4I" %}
3344 ins_encode %{
3345 __ movdl($dst$$XMMRegister, $src$$Register);
3346 __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3347 %}
3348 ins_pipe( pipe_slow );
3349 %}
3350
// Repl4I_mem: splat an int loaded from memory; pshufd's memory operand does
// the load and broadcast in one instruction (AVX form avoids an extra move).
3351 instruct Repl4I_mem(vecX dst, memory mem) %{
3352 predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3353 match(Set dst (ReplicateI (LoadI mem)));
3354 format %{ "pshufd $dst,$mem,0x00\t! replicate4I" %}
3355 ins_encode %{
3356 __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3357 %}
3358 ins_pipe( pipe_slow );
3359 %}
3360
// Repl8I: splat an int into all 8 lanes of a vecY; broadcast in the low xmm
// then mirror into the upper 128 bits.
3361 instruct Repl8I(vecY dst, rRegI src) %{
3362 predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3363 match(Set dst (ReplicateI src));
3364 format %{ "movd $dst,$src\n\t"
3365 "pshufd $dst,$dst,0x00\n\t"
3366 "vinserti128_high $dst,$dst\t! replicate8I" %}
3367 ins_encode %{
3368 __ movdl($dst$$XMMRegister, $src$$Register);
3369 __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3370 __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3371 %}
3372 ins_pipe( pipe_slow );
3373 %}
3374
// Repl8I_mem: as Repl8I but the int comes from memory via pshufd.
3375 instruct Repl8I_mem(vecY dst, memory mem) %{
3376 predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3377 match(Set dst (ReplicateI (LoadI mem)));
3378 format %{ "pshufd $dst,$mem,0x00\n\t"
3379 "vinserti128_high $dst,$dst\t! replicate8I" %}
3380 ins_encode %{
3381 __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3382 __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3383 %}
3384 ins_pipe( pipe_slow );
3385 %}
3386
// Repl4I_imm: splat a 32-bit immediate; replicate8_imm builds the repeated
// 8-byte pattern in the constant table, punpcklqdq doubles it to 16 bytes.
3387 instruct Repl4I_imm(vecX dst, immI con) %{
3388 predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3389 match(Set dst (ReplicateI con));
3390 format %{ "movq $dst,[$constantaddress]\t! replicate4I($con)\n\t"
3391 "punpcklqdq $dst,$dst" %}
3392 ins_encode %{
3393 __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3394 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3395 %}
3396 ins_pipe( pipe_slow );
3397 %}
3398
// Repl8I_imm: as Repl4I_imm, then mirror into the upper 128 bits of a vecY.
3399 instruct Repl8I_imm(vecY dst, immI con) %{
3400 predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3401 match(Set dst (ReplicateI con));
3402 format %{ "movq $dst,[$constantaddress]\t! replicate8I($con)\n\t"
3403 "punpcklqdq $dst,$dst\n\t"
3404 "vinserti128_high $dst,$dst" %}
3405 ins_encode %{
3406 __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3407 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3408 __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3409 %}
3410 ins_pipe( pipe_slow );
3411 %}
3412
3413 // Long could be loaded into xmm register directly from memory.
// Repl2L_mem: splat a long from memory into both lanes of a vecX:
// movq loads the low qword, punpcklqdq duplicates it.
3414 instruct Repl2L_mem(vecX dst, memory mem) %{
3415 predicate(n->as_Vector()->length() == 2 && !VM_Version::supports_avx512vlbw());
3416 match(Set dst (ReplicateL (LoadL mem)));
3417 format %{ "movq $dst,$mem\n\t"
3418 "punpcklqdq $dst,$dst\t! replicate2L" %}
3419 ins_encode %{
3420 __ movq($dst$$XMMRegister, $mem$$Address);
3421 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3422 %}
3423 ins_pipe( pipe_slow );
3424 %}
3425
3426 // Replicate long (8 byte) scalar to be vector
3427 #ifdef _LP64
// Repl4L (64-bit VM): movdq moves the long from a GP register, punpcklqdq
// fills the low 128 bits, vinserti128_high mirrors them into the upper half.
3428 instruct Repl4L(vecY dst, rRegL src) %{
3429 predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3430 match(Set dst (ReplicateL src));
3431 format %{ "movdq $dst,$src\n\t"
3432 "punpcklqdq $dst,$dst\n\t"
3433 "vinserti128_high $dst,$dst\t! replicate4L" %}
3434 ins_encode %{
3435 __ movdq($dst$$XMMRegister, $src$$Register);
3436 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3437 __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3438 %}
3439 ins_pipe( pipe_slow );
3440 %}
3441 #else // _LP64
// Repl4L (32-bit VM): the 64-bit value lives in a GP register pair, so the
// two 32-bit halves are moved separately (HIGH_FROM_LOW picks the upper
// register) and recombined with punpckldq before the qword splat.
3442 instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
3443 predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3444 match(Set dst (ReplicateL src));
3445 effect(TEMP dst, USE src, TEMP tmp);
3446 format %{ "movdl $dst,$src.lo\n\t"
3447 "movdl $tmp,$src.hi\n\t"
3448 "punpckldq $dst,$tmp\n\t"
3449 "punpcklqdq $dst,$dst\n\t"
3450 "vinserti128_high $dst,$dst\t! replicate4L" %}
3451 ins_encode %{
3452 __ movdl($dst$$XMMRegister, $src$$Register);
3453 __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3454 __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3455 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3456 __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3457 %}
3458 ins_pipe( pipe_slow );
3459 %}
3460 #endif // _LP64
3461
// Repl4L_imm: splat a 64-bit immediate loaded from the constant table.
3462 instruct Repl4L_imm(vecY dst, immL con) %{
3463 predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3464 match(Set dst (ReplicateL con));
3465 format %{ "movq $dst,[$constantaddress]\n\t"
3466 "punpcklqdq $dst,$dst\n\t"
3467 "vinserti128_high $dst,$dst\t! replicate4L($con)" %}
3468 ins_encode %{
3469 __ movq($dst$$XMMRegister, $constantaddress($con));
3470 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3471 __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3472 %}
3473 ins_pipe( pipe_slow );
3474 %}
3475
// Repl4L_mem: splat a long loaded from memory into all 4 lanes of a vecY.
3476 instruct Repl4L_mem(vecY dst, memory mem) %{
3477 predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3478 match(Set dst (ReplicateL (LoadL mem)));
3479 format %{ "movq $dst,$mem\n\t"
3480 "punpcklqdq $dst,$dst\n\t"
3481 "vinserti128_high $dst,$dst\t! replicate4L" %}
3482 ins_encode %{
3483 __ movq($dst$$XMMRegister, $mem$$Address);
3484 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3485 __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3486 %}
3487 ins_pipe( pipe_slow );
3488 %}
3489
3490 instruct Repl2F_mem(vecD dst, memory mem) %{
3491 predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3492 match(Set dst (ReplicateF (LoadF mem)));
3493 format %{ "pshufd $dst,$mem,0x00\t! replicate2F" %}
3494 ins_encode %{
3495 __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3496 %}
3497 ins_pipe( pipe_slow );
3498 %}
3499
3500 instruct Repl4F_mem(vecX dst, memory mem) %{
3501 predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3502 match(Set dst (ReplicateF (LoadF mem)));
3503 format %{ "pshufd $dst,$mem,0x00\t! replicate4F" %}
3504 ins_encode %{
3505 __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3506 %}
3507 ins_pipe( pipe_slow );
3508 %}
3509
3510 instruct Repl8F(vecY dst, regF src) %{
3511 predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3512 match(Set dst (ReplicateF src));
3513 format %{ "pshufd $dst,$src,0x00\n\t"
3514 "vinsertf128_high $dst,$dst\t! replicate8F" %}
3515 ins_encode %{
3516 __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3517 __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3518 %}
3519 ins_pipe( pipe_slow );
3520 %}
3521
// Repl8F_mem: same as Repl8F but the scalar float comes from memory;
// pshufd broadcast of the memory operand, then mirror into the upper 128 bits.
3522 instruct Repl8F_mem(vecY dst, memory mem) %{
3523 predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3524 match(Set dst (ReplicateF (LoadF mem)));
3525 format %{ "pshufd $dst,$mem,0x00\n\t"
3526 "vinsertf128_high $dst,$dst\t! replicate8F" %}
3527 ins_encode %{
3528 __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3529 __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3530 %}
3531 ins_pipe( pipe_slow );
3532 %}
3533
// Repl2F_zero: all-zero 2-float vector via the self-xor idiom (xorps dst,dst)
// — no load, and breaks any false dependency on the old register value.
// Limited to UseAVX < 3; EVEX variants handle the AVX-512 case.
3534 instruct Repl2F_zero(vecD dst, immF0 zero) %{
3535 predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3536 match(Set dst (ReplicateF zero));
3537 format %{ "xorps $dst,$dst\t! replicate2F zero" %}
3538 ins_encode %{
3539 __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3540 %}
3541 ins_pipe( fpu_reg_reg );
3542 %}
3543
// Repl4F_zero / Repl8F_zero: materialize an all-zero float vector with the
// self-xor idiom (no load, breaks false dependencies).
// The text here was damaged: the embedded source numbering jumps 3549 -> 3559,
// splicing the tail of Repl8F_zero (256-bit vxorps, vector_len = 1) into the
// middle of Repl4F_zero's encode block — wrong for a vecX destination and
// illegal when UseAVX == 0.  Both instructs are restored below:
// Repl4F_zero uses the legacy 128-bit xorps; Repl8F_zero uses the 3-operand
// AVX vxorps with vector_len = 1 (256 bits).
3544 instruct Repl4F_zero(vecX dst, immF0 zero) %{
3545 predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3546 match(Set dst (ReplicateF zero));
3547 format %{ "xorps $dst,$dst\t! replicate4F zero" %}
3548 ins_encode %{
3549 __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3550 %}
3551 ins_pipe( fpu_reg_reg );
3552 %}
3553
3554 instruct Repl8F_zero(vecY dst, immF0 zero) %{
3555 predicate(n->as_Vector()->length() == 8 && UseAVX < 3);
3556 match(Set dst (ReplicateF zero));
3557 format %{ "vxorps $dst,$dst,$dst\t! replicate8F zero" %}
3558 ins_encode %{
3559 int vector_len = 1;
3560 __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3561 %}
3562 ins_pipe( fpu_reg_reg );
3563 %}
3564
// Repl2D_mem: replicate a double loaded from memory into both 64-bit lanes of
// a 128-bit vector.  pshufd imm 0x44 selects dwords {0,1,0,1}, i.e. it
// duplicates the low quadword of the memory operand.
3565 instruct Repl2D_mem(vecX dst, memory mem) %{
3566 predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3567 match(Set dst (ReplicateD (LoadD mem)));
3568 format %{ "pshufd $dst,$mem,0x44\t! replicate2D" %}
3569 ins_encode %{
3570 __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3571 %}
3572 ins_pipe( pipe_slow );
3573 %}
3574
// Repl4D: replicate a scalar double register into all four lanes of a 256-bit
// YMM register: pshufd 0x44 duplicates the low quadword within 128 bits,
// then vinsertf128_high mirrors the low half into the upper half.
3575 instruct Repl4D(vecY dst, regD src) %{
3576 predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3577 match(Set dst (ReplicateD src));
3578 format %{ "pshufd $dst,$src,0x44\n\t"
3579 "vinsertf128_high $dst,$dst\t! replicate4D" %}
3580 ins_encode %{
3581 __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3582 __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3583 %}
3584 ins_pipe( pipe_slow );
3585 %}
3586
// Repl4D_mem: same as Repl4D but the scalar double comes from memory.
3587 instruct Repl4D_mem(vecY dst, memory mem) %{
3588 predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3589 match(Set dst (ReplicateD (LoadD mem)));
3590 format %{ "pshufd $dst,$mem,0x44\n\t"
3591 "vinsertf128_high $dst,$dst\t! replicate4D" %}
3592 ins_encode %{
3593 __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3594 __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3595 %}
3596 ins_pipe( pipe_slow );
3597 %}
3598
3599 // Replicate double (8 byte) scalar zero to be vector
// Repl2D_zero: all-zero 2-double vector via the self-xor idiom (xorpd dst,dst);
// no memory access and breaks false dependencies on the old register value.
3600 instruct Repl2D_zero(vecX dst, immD0 zero) %{
3601 predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3602 match(Set dst (ReplicateD zero));
3603 format %{ "xorpd $dst,$dst\t! replicate2D zero" %}
3604 ins_encode %{
3605 __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
3606 %}
3607 ins_pipe( fpu_reg_reg );
3608 %}
3609
3610 instruct Repl4D_zero(vecY dst, immD0 zero) %{
3611 predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3612 match(Set dst (ReplicateD zero));
3613 format %{ "vxorpd $dst,$dst,$dst,vect256\t! replicate4D zero" %}
3614 ins_encode %{
4774 "movd $dst,$tmp2\t! add reduction4I" %}
4775 ins_encode %{
4776 int vector_len = 0;
4777 __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4778 __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4779 __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4780 __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4781 __ movdl($tmp2$$XMMRegister, $src1$$Register);
4782 __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4783 __ movdl($dst$$Register, $tmp2$$XMMRegister);
4784 %}
4785 ins_pipe( pipe_slow );
4786 %}
4787
// rvadd8I_reduction_reg: dst = src1 + horizontal sum of the 8 ints in src2.
// AVX1-only path (supports_avxonly): two 256-bit vphaddd passes collapse
// adjacent pairs, the upper 128 bits are folded into the lower with
// vextracti128_high + vpaddd, and the scalar src1 is added last.
// tmp and tmp2 are clobbered XMM scratch registers.
// NOTE(review): the second vphaddd reads $tmp2 before it is written; the
// lanes derived from that garbage appear to be discarded by the later
// extract/fold so only src2's sums reach lane 0 — confirm.
4788 instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
4789 predicate(VM_Version::supports_avxonly());
4790 match(Set dst (AddReductionVI src1 src2));
4791 effect(TEMP tmp, TEMP tmp2);
4792 format %{ "vphaddd $tmp,$src2,$src2\n\t"
4793 "vphaddd $tmp,$tmp,$tmp2\n\t"
4794 "vextracti128_high $tmp2,$tmp\n\t"
4795 "vpaddd $tmp,$tmp,$tmp2\n\t"
4796 "movd $tmp2,$src1\n\t"
4797 "vpaddd $tmp2,$tmp2,$tmp\n\t"
4798 "movd $dst,$tmp2\t! add reduction8I" %}
4799 ins_encode %{
4800 int vector_len = 1;
4801 __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4802 __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4803 __ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
4804 __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4805 __ movdl($tmp2$$XMMRegister, $src1$$Register);
4806 __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4807 __ movdl($dst$$Register, $tmp2$$XMMRegister);
4808 %}
4809 ins_pipe( pipe_slow );
4810 %}
4811
// rvadd8I_reduction_reg_evex: dst = src1 + horizontal sum of 8 ints (AVX2+/EVEX
// capable CPUs, UseAVX > 2).  Fold the high 128 bits into the low half, then
// halve twice with pshufd (0xE: upper quadword, 0x1: lane 1) + vpaddd, and
// finally add the scalar src1.  All vpaddd here are 128-bit (vector_len = 0).
4812 instruct rvadd8I_reduction_reg_evex(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
4813 predicate(UseAVX > 2);
4814 match(Set dst (AddReductionVI src1 src2));
4815 effect(TEMP tmp, TEMP tmp2);
4816 format %{ "vextracti128_high $tmp,$src2\n\t"
4817 "vpaddd $tmp,$tmp,$src2\n\t"
4818 "pshufd $tmp2,$tmp,0xE\n\t"
4819 "vpaddd $tmp,$tmp,$tmp2\n\t"
4820 "pshufd $tmp2,$tmp,0x1\n\t"
4821 "vpaddd $tmp,$tmp,$tmp2\n\t"
4822 "movd $tmp2,$src1\n\t"
4823 "vpaddd $tmp2,$tmp,$tmp2\n\t"
4824 "movd $dst,$tmp2\t! add reduction8I" %}
4825 ins_encode %{
4826 int vector_len = 0;
4827 __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
4828 __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
4829 __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
4830 __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4831 __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4832 __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4833 __ movdl($tmp2$$XMMRegister, $src1$$Register);
4834 __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4835 __ movdl($dst$$Register, $tmp2$$XMMRegister);
4836 %}
4837 ins_pipe( pipe_slow );
4838 %}
4839
// rvadd16I_reduction_reg_evex: dst = src1 + horizontal sum of the 16 ints in
// a 512-bit src2 (UseAVX > 2).  Fold 512 -> 256 (vextracti64x4_high, 256-bit
// vpaddd), then 256 -> 128, then halve twice with pshufd + 128-bit vpaddd,
// and finally add the scalar src1.
// Fix: the trailing format comment said "mul reduction16I" — this is an
// AddReductionVI instruct, so the debug text now says "add reduction16I".
4840 instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
4841 predicate(UseAVX > 2);
4842 match(Set dst (AddReductionVI src1 src2));
4843 effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
4844 format %{ "vextracti64x4_high $tmp3,$src2\n\t"
4845 "vpaddd $tmp3,$tmp3,$src2\n\t"
4846 "vextracti128_high $tmp,$tmp3\n\t"
4847 "vpaddd $tmp,$tmp,$tmp3\n\t"
4848 "pshufd $tmp2,$tmp,0xE\n\t"
4849 "vpaddd $tmp,$tmp,$tmp2\n\t"
4850 "pshufd $tmp2,$tmp,0x1\n\t"
4851 "vpaddd $tmp,$tmp,$tmp2\n\t"
4852 "movd $tmp2,$src1\n\t"
4853 "vpaddd $tmp2,$tmp,$tmp2\n\t"
4854 "movd $dst,$tmp2\t! add reduction16I" %}
4855 ins_encode %{
4856 __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
4857 __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
4858 __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
4859 __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
4860 __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
4861 __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4862 __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4863 __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4864 __ movdl($tmp2$$XMMRegister, $src1$$Register);
4865 __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4866 __ movdl($dst$$Register, $tmp2$$XMMRegister);
4867 %}
4868 ins_pipe( pipe_slow );
4869 %}
4870
4871 #ifdef _LP64
// rvadd2L_reduction_reg: dst = src1 + src2[0] + src2[1] for a 2-long vector
// (64-bit only, UseAVX > 2).  pshufd 0xE brings the high quadword down,
// vpaddq folds the two lanes, then the scalar src1 is added via movdq.
4872 instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
4873 predicate(UseAVX > 2);
4874 match(Set dst (AddReductionVL src1 src2));
4875 effect(TEMP tmp, TEMP tmp2);
4876 format %{ "pshufd $tmp2,$src2,0xE\n\t"
4877 "vpaddq $tmp,$src2,$tmp2\n\t"
4878 "movdq $tmp2,$src1\n\t"
4879 "vpaddq $tmp2,$tmp,$tmp2\n\t"
4880 "movdq $dst,$tmp2\t! add reduction2L" %}
4881 ins_encode %{
4882 __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4883 __ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
4884 __ movdq($tmp2$$XMMRegister, $src1$$Register);
4885 __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4886 __ movdq($dst$$Register, $tmp2$$XMMRegister);
4887 %}
4888 ins_pipe( pipe_slow );
4889 %}
4890
// rvadd4L_reduction_reg: dst = src1 + horizontal sum of 4 longs (UseAVX > 2).
// Fold 256 -> 128 bits, fold the two remaining quadwords, then add src1.
4891 instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
4892 predicate(UseAVX > 2);
4893 match(Set dst (AddReductionVL src1 src2));
4894 effect(TEMP tmp, TEMP tmp2);
4895 format %{ "vextracti128_high $tmp,$src2\n\t"
4896 "vpaddq $tmp2,$tmp,$src2\n\t"
4897 "pshufd $tmp,$tmp2,0xE\n\t"
4898 "vpaddq $tmp2,$tmp2,$tmp\n\t"
4899 "movdq $tmp,$src1\n\t"
4900 "vpaddq $tmp2,$tmp2,$tmp\n\t"
4901 "movdq $dst,$tmp2\t! add reduction4L" %}
4902 ins_encode %{
4903 __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
4904 __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
4905 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4906 __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4907 __ movdq($tmp$$XMMRegister, $src1$$Register);
4908 __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4909 __ movdq($dst$$Register, $tmp2$$XMMRegister);
4910 %}
4911 ins_pipe( pipe_slow );
4912 %}
4913
// rvadd8L_reduction_reg: dst = src1 + horizontal sum of 8 longs in a 512-bit
// vector (UseAVX > 2).  Successive folds: 512 -> 256 (vextracti64x4_high,
// 256-bit vpaddq), 256 -> 128, 128 -> 64 (pshufd 0xE), then add src1.
4914 instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
4915 predicate(UseAVX > 2);
4916 match(Set dst (AddReductionVL src1 src2));
4917 effect(TEMP tmp, TEMP tmp2);
4918 format %{ "vextracti64x4_high $tmp2,$src2\n\t"
4919 "vpaddq $tmp2,$tmp2,$src2\n\t"
4920 "vextracti128_high $tmp,$tmp2\n\t"
4921 "vpaddq $tmp2,$tmp2,$tmp\n\t"
4922 "pshufd $tmp,$tmp2,0xE\n\t"
4923 "vpaddq $tmp2,$tmp2,$tmp\n\t"
4924 "movdq $tmp,$src1\n\t"
4925 "vpaddq $tmp2,$tmp2,$tmp\n\t"
4926 "movdq $dst,$tmp2\t! add reduction8L" %}
4927 ins_encode %{
4928 __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
4929 __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
4930 __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
4931 __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4932 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4933 __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4934 __ movdq($tmp$$XMMRegister, $src1$$Register);
4935 __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4936 __ movdq($dst$$Register, $tmp2$$XMMRegister);
4937 %}
4938 ins_pipe( pipe_slow );
4939 %}
4940 #endif
4941
4942 instruct rsadd2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
4943 predicate(UseSSE >= 1 && UseAVX == 0);
4944 match(Set dst (AddReductionVF dst src2));
4945 effect(TEMP dst, TEMP tmp);
4946 format %{ "addss $dst,$src2\n\t"
4947 "pshufd $tmp,$src2,0x01\n\t"
4948 "addss $dst,$tmp\t! add reduction2F" %}
4949 ins_encode %{
4950 __ addss($dst$$XMMRegister, $src2$$XMMRegister);
5009 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5010 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5011 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5012 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5013 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5014 %}
5015 ins_pipe( pipe_slow );
5016 %}
5017
// radd8F_reduction_reg: dst = dst + sum of the 8 floats in src2 (AVX path).
// Adds the four low-half lanes one at a time via pshufd lane selects
// (0x01/0x02/0x03) and scalar vaddss, then extracts the upper 128 bits and
// repeats.  Strictly in-order scalar adds preserve the Java-defined
// left-to-right floating-point summation order.
5018 instruct radd8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
5019 predicate(UseAVX > 0);
5020 match(Set dst (AddReductionVF dst src2));
5021 effect(TEMP tmp, TEMP dst, TEMP tmp2);
5022 format %{ "vaddss $dst,$dst,$src2\n\t"
5023 "pshufd $tmp,$src2,0x01\n\t"
5024 "vaddss $dst,$dst,$tmp\n\t"
5025 "pshufd $tmp,$src2,0x02\n\t"
5026 "vaddss $dst,$dst,$tmp\n\t"
5027 "pshufd $tmp,$src2,0x03\n\t"
5028 "vaddss $dst,$dst,$tmp\n\t"
5029 "vextractf128_high $tmp2,$src2\n\t"
5030 "vaddss $dst,$dst,$tmp2\n\t"
5031 "pshufd $tmp,$tmp2,0x01\n\t"
5032 "vaddss $dst,$dst,$tmp\n\t"
5033 "pshufd $tmp,$tmp2,0x02\n\t"
5034 "vaddss $dst,$dst,$tmp\n\t"
5035 "pshufd $tmp,$tmp2,0x03\n\t"
5036 "vaddss $dst,$dst,$tmp\t! add reduction8F" %}
5037 ins_encode %{
5038 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5039 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5040 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5041 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5042 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5043 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5044 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5045 __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5046 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5047 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5048 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5049 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5050 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5051 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5052 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5053 %}
5054 ins_pipe( pipe_slow );
5055 %}
5056
// radd16F_reduction_reg: dst = dst + sum of the 16 floats in a 512-bit src2
// (UseAVX > 2).  Each 128-bit quarter is brought down with vextractf32x4
// (imm 0x1..0x3 selects the quarter), then its four lanes are added
// one-by-one via pshufd + scalar vaddss.  The fully sequential scalar adds
// preserve strict left-to-right floating-point summation order.
5057 instruct radd16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
5058 predicate(UseAVX > 2);
5059 match(Set dst (AddReductionVF dst src2));
5060 effect(TEMP tmp, TEMP dst, TEMP tmp2);
5061 format %{ "vaddss $dst,$dst,$src2\n\t"
5062 "pshufd $tmp,$src2,0x01\n\t"
5063 "vaddss $dst,$dst,$tmp\n\t"
5064 "pshufd $tmp,$src2,0x02\n\t"
5065 "vaddss $dst,$dst,$tmp\n\t"
5066 "pshufd $tmp,$src2,0x03\n\t"
5067 "vaddss $dst,$dst,$tmp\n\t"
5068 "vextractf32x4 $tmp2,$src2,0x1\n\t"
5069 "vaddss $dst,$dst,$tmp2\n\t"
5070 "pshufd $tmp,$tmp2,0x01\n\t"
5071 "vaddss $dst,$dst,$tmp\n\t"
5072 "pshufd $tmp,$tmp2,0x02\n\t"
5073 "vaddss $dst,$dst,$tmp\n\t"
5074 "pshufd $tmp,$tmp2,0x03\n\t"
5075 "vaddss $dst,$dst,$tmp\n\t"
5076 "vextractf32x4 $tmp2,$src2,0x2\n\t"
5077 "vaddss $dst,$dst,$tmp2\n\t"
5078 "pshufd $tmp,$tmp2,0x01\n\t"
5079 "vaddss $dst,$dst,$tmp\n\t"
5080 "pshufd $tmp,$tmp2,0x02\n\t"
5081 "vaddss $dst,$dst,$tmp\n\t"
5082 "pshufd $tmp,$tmp2,0x03\n\t"
5083 "vaddss $dst,$dst,$tmp\n\t"
5084 "vextractf32x4 $tmp2,$src2,0x3\n\t"
5085 "vaddss $dst,$dst,$tmp2\n\t"
5086 "pshufd $tmp,$tmp2,0x01\n\t"
5087 "vaddss $dst,$dst,$tmp\n\t"
5088 "pshufd $tmp,$tmp2,0x02\n\t"
5089 "vaddss $dst,$dst,$tmp\n\t"
5090 "pshufd $tmp,$tmp2,0x03\n\t"
5091 "vaddss $dst,$dst,$tmp\t! add reduction16F" %}
5092 ins_encode %{
5093 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5094 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5095 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5096 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5097 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5098 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5099 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5100 __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5101 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5102 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5103 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5104 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5105 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5106 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5107 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5108 __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5109 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5110 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5111 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5112 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5113 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5114 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5115 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5116 __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5117 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5118 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5119 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5120 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5121 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5122 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5123 __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5124 %}
5125 ins_pipe( pipe_slow );
5126 %}
5127
// rsadd2D_reduction_reg / rvadd2D_reduction_reg: dst = dst + src2[0] + src2[1].
// The text here was damaged: the embedded source numbering jumps 5136 -> 5145,
// fusing the head of the SSE instruct onto the body of the AVX instruct.
// The 8 missing lines are restored below.  pshufd imm 0xE moves the high
// 64-bit lane of src2 into the low lane of tmp; the SSE path (UseAVX == 0)
// uses two-operand addsd, the AVX path uses three-operand vaddsd.
5128 instruct rsadd2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5129 predicate(UseSSE >= 1 && UseAVX == 0);
5130 match(Set dst (AddReductionVD dst src2));
5131 effect(TEMP tmp, TEMP dst);
5132 format %{ "addsd $dst,$src2\n\t"
5133 "pshufd $tmp,$src2,0xE\n\t"
5134 "addsd $dst,$tmp\t! add reduction2D" %}
5135 ins_encode %{
5136 __ addsd($dst$$XMMRegister, $src2$$XMMRegister);
5137 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5138 __ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
5139 %}
5140 ins_pipe( pipe_slow );
5141 %}
5142
5143 instruct rvadd2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5144 predicate(UseAVX > 0);
5145 match(Set dst (AddReductionVD dst src2));
5146 effect(TEMP tmp, TEMP dst);
5147 format %{ "vaddsd $dst,$dst,$src2\n\t"
5148 "pshufd $tmp,$src2,0xE\n\t"
5149 "vaddsd $dst,$dst,$tmp\t! add reduction2D" %}
5150 ins_encode %{
5151 __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5152 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5153 __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5154 %}
5155 ins_pipe( pipe_slow );
5156 %}
5157
// rvadd4D_reduction_reg: dst = dst + sum of the 4 doubles in src2 (AVX path).
// Adds the two low-half lanes (pshufd 0xE selects the high quadword), then
// extracts the upper 128 bits and repeats.  Sequential scalar vaddsd keeps
// the left-to-right floating-point summation order.
// NOTE(review): vextractf32x4 is the EVEX mnemonic but the predicate only
// requires UseAVX > 0 — presumably the MacroAssembler emits a legacy
// vextractf128 encoding on pre-AVX-512 CPUs; confirm.
5158 instruct rvadd4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
5159 predicate(UseAVX > 0);
5160 match(Set dst (AddReductionVD dst src2));
5161 effect(TEMP tmp, TEMP dst, TEMP tmp2);
5162 format %{ "vaddsd $dst,$dst,$src2\n\t"
5163 "pshufd $tmp,$src2,0xE\n\t"
5164 "vaddsd $dst,$dst,$tmp\n\t"
5165 "vextractf32x4 $tmp2,$src2,0x1\n\t"
5166 "vaddsd $dst,$dst,$tmp2\n\t"
5167 "pshufd $tmp,$tmp2,0xE\n\t"
5168 "vaddsd $dst,$dst,$tmp\t! add reduction4D" %}
5169 ins_encode %{
5170 __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5171 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5172 __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5173 __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5174 __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5175 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5176 __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5177 %}
5178 ins_pipe( pipe_slow );
5179 %}
5180
// rvadd8D_reduction_reg: dst = dst + sum of the 8 doubles in a 512-bit src2
// (UseAVX > 2).  Each 128-bit quarter is extracted with vextractf32x4
// (imm 0x1..0x3), then its two lanes are added via pshufd 0xE + scalar
// vaddsd.  Sequential adds preserve summation order.
5181 instruct rvadd8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
5182 predicate(UseAVX > 2);
5183 match(Set dst (AddReductionVD dst src2));
5184 effect(TEMP tmp, TEMP dst, TEMP tmp2);
5185 format %{ "vaddsd $dst,$dst,$src2\n\t"
5186 "pshufd $tmp,$src2,0xE\n\t"
5187 "vaddsd $dst,$dst,$tmp\n\t"
5188 "vextractf32x4 $tmp2,$src2,0x1\n\t"
5189 "vaddsd $dst,$dst,$tmp2\n\t"
5190 "pshufd $tmp,$tmp2,0xE\n\t"
5191 "vaddsd $dst,$dst,$tmp\n\t"
5192 "vextractf32x4 $tmp2,$src2,0x2\n\t"
5193 "vaddsd $dst,$dst,$tmp2\n\t"
5194 "pshufd $tmp,$tmp2,0xE\n\t"
5195 "vaddsd $dst,$dst,$tmp\n\t"
5196 "vextractf32x4 $tmp2,$src2,0x3\n\t"
5197 "vaddsd $dst,$dst,$tmp2\n\t"
5198 "pshufd $tmp,$tmp2,0xE\n\t"
5199 "vaddsd $dst,$dst,$tmp\t! add reduction8D" %}
5200 ins_encode %{
5201 __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5202 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5203 __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5204 __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5205 __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5206 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5207 __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5208 __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5209 __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5210 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5211 __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5212 __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5213 __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5214 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5215 __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5216 %}
5217 ins_pipe( pipe_slow );
5218 %}
5219
5220 instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
5221 predicate(UseSSE > 3 && UseAVX == 0);
5222 match(Set dst (MulReductionVI src1 src2));
5223 effect(TEMP tmp, TEMP tmp2);
5224 format %{ "pshufd $tmp2,$src2,0x1\n\t"
5225 "pmulld $tmp2,$src2\n\t"
5226 "movd $tmp,$src1\n\t"
5227 "pmulld $tmp2,$tmp\n\t"
5228 "movd $dst,$tmp2\t! mul reduction2I" %}
5229 ins_encode %{
5230 __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5231 __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5232 __ movdl($tmp$$XMMRegister, $src1$$Register);
5290 "movd $tmp2,$src1\n\t"
5291 "vpmulld $tmp2,$tmp,$tmp2\n\t"
5292 "movd $dst,$tmp2\t! mul reduction4I" %}
5293 ins_encode %{
5294 int vector_len = 0;
5295 __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5296 __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5297 __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5298 __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5299 __ movdl($tmp2$$XMMRegister, $src1$$Register);
5300 __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5301 __ movdl($dst$$Register, $tmp2$$XMMRegister);
5302 %}
5303 ins_pipe( pipe_slow );
5304 %}
5305
// rvmul8I_reduction_reg: dst = src1 * product of the 8 ints in src2 (AVX path).
// Fold 256 -> 128 bits with vextracti128_high + vpmulld, halve twice with
// pshufd (0xE, then 0x1) + vpmulld, then multiply in the scalar src1.
// All vpmulld operations are 128-bit (vector_len = 0).
5306 instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
5307 predicate(UseAVX > 0);
5308 match(Set dst (MulReductionVI src1 src2));
5309 effect(TEMP tmp, TEMP tmp2);
5310 format %{ "vextracti128_high $tmp,$src2\n\t"
5311 "vpmulld $tmp,$tmp,$src2\n\t"
5312 "pshufd $tmp2,$tmp,0xE\n\t"
5313 "vpmulld $tmp,$tmp,$tmp2\n\t"
5314 "pshufd $tmp2,$tmp,0x1\n\t"
5315 "vpmulld $tmp,$tmp,$tmp2\n\t"
5316 "movd $tmp2,$src1\n\t"
5317 "vpmulld $tmp2,$tmp,$tmp2\n\t"
5318 "movd $dst,$tmp2\t! mul reduction8I" %}
5319 ins_encode %{
5320 int vector_len = 0;
5321 __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5322 __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5323 __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5324 __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5325 __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5326 __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5327 __ movdl($tmp2$$XMMRegister, $src1$$Register);
5328 __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5329 __ movdl($dst$$Register, $tmp2$$XMMRegister);
5330 %}
5331 ins_pipe( pipe_slow );
5332 %}
5333
// rvmul16I_reduction_reg: dst = src1 * product of the 16 ints in a 512-bit
// src2 (UseAVX > 2).  Fold 512 -> 256 (vextracti64x4_high, 256-bit vpmulld),
// then 256 -> 128, halve twice with pshufd + 128-bit vpmulld, and multiply
// in the scalar src1.
// Fix: the fourth format line said "vpmulld $tmp,$tmp,$src2" but the encode
// multiplies $tmp by $tmp3 (matching rvadd16I's pattern); the debug text now
// agrees with the emitted code.
5334 instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
5335 predicate(UseAVX > 2);
5336 match(Set dst (MulReductionVI src1 src2));
5337 effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5338 format %{ "vextracti64x4_high $tmp3,$src2\n\t"
5339 "vpmulld $tmp3,$tmp3,$src2\n\t"
5340 "vextracti128_high $tmp,$tmp3\n\t"
5341 "vpmulld $tmp,$tmp,$tmp3\n\t"
5342 "pshufd $tmp2,$tmp,0xE\n\t"
5343 "vpmulld $tmp,$tmp,$tmp2\n\t"
5344 "pshufd $tmp2,$tmp,0x1\n\t"
5345 "vpmulld $tmp,$tmp,$tmp2\n\t"
5346 "movd $tmp2,$src1\n\t"
5347 "vpmulld $tmp2,$tmp,$tmp2\n\t"
5348 "movd $dst,$tmp2\t! mul reduction16I" %}
5349 ins_encode %{
5350 __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
5351 __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5352 __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
5353 __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5354 __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5355 __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5356 __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5357 __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5358 __ movdl($tmp2$$XMMRegister, $src1$$Register);
5359 __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5360 __ movdl($dst$$Register, $tmp2$$XMMRegister);
5361 %}
5362 ins_pipe( pipe_slow );
5363 %}
5364
5365 #ifdef _LP64
// rvmul2L_reduction_reg: dst = src1 * src2[0] * src2[1] for a 2-long vector.
// Requires AVX-512DQ because vpmullq (64-bit element multiply) is an
// AVX-512DQ instruction.  pshufd 0xE brings the high quadword down.
5366 instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
5367 predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5368 match(Set dst (MulReductionVL src1 src2));
5369 effect(TEMP tmp, TEMP tmp2);
5370 format %{ "pshufd $tmp2,$src2,0xE\n\t"
5371 "vpmullq $tmp,$src2,$tmp2\n\t"
5372 "movdq $tmp2,$src1\n\t"
5373 "vpmullq $tmp2,$tmp,$tmp2\n\t"
5374 "movdq $dst,$tmp2\t! mul reduction2L" %}
5375 ins_encode %{
5376 __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5377 __ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5378 __ movdq($tmp2$$XMMRegister, $src1$$Register);
5379 __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5380 __ movdq($dst$$Register, $tmp2$$XMMRegister);
5381 %}
5382 ins_pipe( pipe_slow );
5383 %}
5384
// rvmul4L_reduction_reg: dst = src1 * product of 4 longs (AVX-512DQ needed
// for vpmullq).  Fold 256 -> 128 bits, fold the remaining two quadwords,
// then multiply in the scalar src1.
5385 instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
5386 predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5387 match(Set dst (MulReductionVL src1 src2));
5388 effect(TEMP tmp, TEMP tmp2);
5389 format %{ "vextracti128_high $tmp,$src2\n\t"
5390 "vpmullq $tmp2,$tmp,$src2\n\t"
5391 "pshufd $tmp,$tmp2,0xE\n\t"
5392 "vpmullq $tmp2,$tmp2,$tmp\n\t"
5393 "movdq $tmp,$src1\n\t"
5394 "vpmullq $tmp2,$tmp2,$tmp\n\t"
5395 "movdq $dst,$tmp2\t! mul reduction4L" %}
5396 ins_encode %{
5397 __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5398 __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5399 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5400 __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5401 __ movdq($tmp$$XMMRegister, $src1$$Register);
5402 __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5403 __ movdq($dst$$Register, $tmp2$$XMMRegister);
5404 %}
5405 ins_pipe( pipe_slow );
5406 %}
5407
// rvmul8L_reduction_reg: dst = src1 * product of 8 longs in a 512-bit vector
// (AVX-512DQ needed for vpmullq).  Successive folds: 512 -> 256
// (vextracti64x4_high, 256-bit vpmullq), 256 -> 128, 128 -> 64 (pshufd 0xE),
// then multiply in the scalar src1.
5408 instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
5409 predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5410 match(Set dst (MulReductionVL src1 src2));
5411 effect(TEMP tmp, TEMP tmp2);
5412 format %{ "vextracti64x4_high $tmp2,$src2\n\t"
5413 "vpmullq $tmp2,$tmp2,$src2\n\t"
5414 "vextracti128_high $tmp,$tmp2\n\t"
5415 "vpmullq $tmp2,$tmp2,$tmp\n\t"
5416 "pshufd $tmp,$tmp2,0xE\n\t"
5417 "vpmullq $tmp2,$tmp2,$tmp\n\t"
5418 "movdq $tmp,$src1\n\t"
5419 "vpmullq $tmp2,$tmp2,$tmp\n\t"
5420 "movdq $dst,$tmp2\t! mul reduction8L" %}
5421 ins_encode %{
5422 __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5423 __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5424 __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
5425 __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5426 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5427 __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5428 __ movdq($tmp$$XMMRegister, $src1$$Register);
5429 __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5430 __ movdq($dst$$Register, $tmp2$$XMMRegister);
5431 %}
5432 ins_pipe( pipe_slow );
5433 %}
5434 #endif
5435
5436 instruct rsmul2F_reduction(regF dst, vecD src2, regF tmp) %{
5437 predicate(UseSSE >= 1 && UseAVX == 0);
5438 match(Set dst (MulReductionVF dst src2));
5439 effect(TEMP dst, TEMP tmp);
5440 format %{ "mulss $dst,$src2\n\t"
5441 "pshufd $tmp,$src2,0x01\n\t"
5442 "mulss $dst,$tmp\t! mul reduction2F" %}
5443 ins_encode %{
5444 __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5503 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5504 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5505 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5506 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5507 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5508 %}
5509 ins_pipe( pipe_slow );
5510 %}
5511
// rvmul8F_reduction_reg: dst = dst * product of the 8 floats in src2 (AVX
// path).  Multiplies the four low-half lanes one at a time via pshufd lane
// selects (0x01/0x02/0x03) and scalar vmulss, then extracts the upper
// 128 bits and repeats.  Sequential scalar multiplies preserve the
// left-to-right floating-point evaluation order.
5512 instruct rvmul8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
5513 predicate(UseAVX > 0);
5514 match(Set dst (MulReductionVF dst src2));
5515 effect(TEMP tmp, TEMP dst, TEMP tmp2);
5516 format %{ "vmulss $dst,$dst,$src2\n\t"
5517 "pshufd $tmp,$src2,0x01\n\t"
5518 "vmulss $dst,$dst,$tmp\n\t"
5519 "pshufd $tmp,$src2,0x02\n\t"
5520 "vmulss $dst,$dst,$tmp\n\t"
5521 "pshufd $tmp,$src2,0x03\n\t"
5522 "vmulss $dst,$dst,$tmp\n\t"
5523 "vextractf128_high $tmp2,$src2\n\t"
5524 "vmulss $dst,$dst,$tmp2\n\t"
5525 "pshufd $tmp,$tmp2,0x01\n\t"
5526 "vmulss $dst,$dst,$tmp\n\t"
5527 "pshufd $tmp,$tmp2,0x02\n\t"
5528 "vmulss $dst,$dst,$tmp\n\t"
5529 "pshufd $tmp,$tmp2,0x03\n\t"
5530 "vmulss $dst,$dst,$tmp\t! mul reduction8F" %}
5531 ins_encode %{
5532 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5533 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5534 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5535 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5536 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5537 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5538 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5539 __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5540 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5541 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5542 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5543 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5544 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5545 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5546 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5547 %}
5548 ins_pipe( pipe_slow );
5549 %}
5550
// rvmul16F_reduction_reg: dst = dst * product of the 16 floats in a 512-bit
// src2 (UseAVX > 2).  Each 128-bit quarter is brought down with
// vextractf32x4 (imm 0x1..0x3), then its four lanes are multiplied in
// one-by-one via pshufd + scalar vmulss, preserving left-to-right order.
5551 instruct rvmul16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
5552 predicate(UseAVX > 2);
5553 match(Set dst (MulReductionVF dst src2));
5554 effect(TEMP tmp, TEMP dst, TEMP tmp2);
5555 format %{ "vmulss $dst,$dst,$src2\n\t"
5556 "pshufd $tmp,$src2,0x01\n\t"
5557 "vmulss $dst,$dst,$tmp\n\t"
5558 "pshufd $tmp,$src2,0x02\n\t"
5559 "vmulss $dst,$dst,$tmp\n\t"
5560 "pshufd $tmp,$src2,0x03\n\t"
5561 "vmulss $dst,$dst,$tmp\n\t"
5562 "vextractf32x4 $tmp2,$src2,0x1\n\t"
5563 "vmulss $dst,$dst,$tmp2\n\t"
5564 "pshufd $tmp,$tmp2,0x01\n\t"
5565 "vmulss $dst,$dst,$tmp\n\t"
5566 "pshufd $tmp,$tmp2,0x02\n\t"
5567 "vmulss $dst,$dst,$tmp\n\t"
5568 "pshufd $tmp,$tmp2,0x03\n\t"
5569 "vmulss $dst,$dst,$tmp\n\t"
5570 "vextractf32x4 $tmp2,$src2,0x2\n\t"
5571 "vmulss $dst,$dst,$tmp2\n\t"
5572 "pshufd $tmp,$tmp2,0x01\n\t"
5573 "vmulss $dst,$dst,$tmp\n\t"
5574 "pshufd $tmp,$tmp2,0x02\n\t"
5575 "vmulss $dst,$dst,$tmp\n\t"
5576 "pshufd $tmp,$tmp2,0x03\n\t"
5577 "vmulss $dst,$dst,$tmp\n\t"
5578 "vextractf32x4 $tmp2,$src2,0x3\n\t"
5579 "vmulss $dst,$dst,$tmp2\n\t"
5580 "pshufd $tmp,$tmp2,0x01\n\t"
5581 "vmulss $dst,$dst,$tmp\n\t"
5582 "pshufd $tmp,$tmp2,0x02\n\t"
5583 "vmulss $dst,$dst,$tmp\n\t"
5584 "pshufd $tmp,$tmp2,0x03\n\t"
5585 "vmulss $dst,$dst,$tmp\t! mul reduction16F" %}
5586 ins_encode %{
5587 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5588 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5589 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5590 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5591 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5592 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5593 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5594 __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5595 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5596 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5597 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5598 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5599 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5600 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5601 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5602 __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5603 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5604 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5605 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5606 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5607 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5608 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5609 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5610 __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5611 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5612 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5613 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5614 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5615 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5616 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5617 __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5618 %}
5619 ins_pipe( pipe_slow );
5620 %}
5621
5622 instruct rsmul2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5623 predicate(UseSSE >= 1 && UseAVX == 0);
5624 match(Set dst (MulReductionVD dst src2));
5625 effect(TEMP dst, TEMP tmp);
5626 format %{ "mulsd $dst,$src2\n\t"
5627 "pshufd $tmp,$src2,0xE\n\t"
5628 "mulsd $dst,$tmp\t! mul reduction2D" %}
5629 ins_encode %{
5630 __ mulsd($dst$$XMMRegister, $src2$$XMMRegister);
5639 match(Set dst (MulReductionVD dst src2));
5640 effect(TEMP tmp, TEMP dst);
5641 format %{ "vmulsd $dst,$dst,$src2\n\t"
5642 "pshufd $tmp,$src2,0xE\n\t"
5643 "vmulsd $dst,$dst,$tmp\t! mul reduction2D" %}
5644 ins_encode %{
5645 __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5646 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5647 __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5648 %}
5649 ins_pipe( pipe_slow );
5650 %}
5651
// Multiply reduction of 4 packed doubles in a 256-bit YMM register:
// dst = dst * src2[0] * src2[1] * src2[2] * src2[3].
// $tmp holds the odd lane rotated down with pshufd (0xE moves the high
// 64 bits of an XMM to the low 64); $tmp2 holds the upper 128-bit half
// pulled down with vextractf128_high.  AVX1+ path (UseAVX > 0).
5652 instruct rvmul4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
5653 predicate(UseAVX > 0);
5654 match(Set dst (MulReductionVD dst src2));
5655 effect(TEMP tmp, TEMP dst, TEMP tmp2);
5656 format %{ "vmulsd $dst,$dst,$src2\n\t"
5657 "pshufd $tmp,$src2,0xE\n\t"
5658 "vmulsd $dst,$dst,$tmp\n\t"
5659 "vextractf128_high $tmp2,$src2\n\t"
5660 "vmulsd $dst,$dst,$tmp2\n\t"
5661 "pshufd $tmp,$tmp2,0xE\n\t"
5662 "vmulsd $dst,$dst,$tmp\t! mul reduction4D" %}
5663 ins_encode %{
     // Lanes 0 and 1 (low 128 bits).
5664 __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5665 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5666 __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
     // Lanes 2 and 3: bring the high 128-bit half into $tmp2 first.
5667 __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5668 __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5669 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5670 __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5671 %}
5672 ins_pipe( pipe_slow );
5673 %}
5674
// Multiply reduction of 8 packed doubles in a 512-bit ZMM register:
// dst = dst * src2[0] * src2[1] * ... * src2[7].
// Each 128-bit quadrant contributes two lanes: the quadrant is extracted
// into $tmp2 (quadrant 0 is read directly from $src2), multiplied into the
// accumulator, then pshufd 0xE rotates its high lane down into $tmp for the
// second multiply.  AVX-512 only (UseAVX > 2), for vextractf32x4 on a vecZ.
//
// FIX: the format line for the post-quadrant-1 shuffle previously printed
// "pshufd $tmp,$src2,0xE", but the encoder emits pshufd($tmp, $tmp2, 0xE)
// (matching the quadrant-2 and quadrant-3 stages).  The format string now
// matches the generated code; emitted machine code is unchanged.
5675 instruct rvmul8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
5676 predicate(UseAVX > 2);
5677 match(Set dst (MulReductionVD dst src2));
5678 effect(TEMP tmp, TEMP dst, TEMP tmp2);
5679 format %{ "vmulsd $dst,$dst,$src2\n\t"
5680 "pshufd $tmp,$src2,0xE\n\t"
5681 "vmulsd $dst,$dst,$tmp\n\t"
5682 "vextractf32x4 $tmp2,$src2,0x1\n\t"
5683 "vmulsd $dst,$dst,$tmp2\n\t"
5684 "pshufd $tmp,$tmp2,0xE\n\t"
5685 "vmulsd $dst,$dst,$tmp\n\t"
5686 "vextractf32x4 $tmp2,$src2,0x2\n\t"
5687 "vmulsd $dst,$dst,$tmp2\n\t"
5688 "pshufd $tmp,$tmp2,0xE\n\t"
5689 "vmulsd $dst,$dst,$tmp\n\t"
5690 "vextractf32x4 $tmp2,$src2,0x3\n\t"
5691 "vmulsd $dst,$dst,$tmp2\n\t"
5692 "pshufd $tmp,$tmp2,0xE\n\t"
5693 "vmulsd $dst,$dst,$tmp\t! mul reduction8D" %}
5694 ins_encode %{
     // Lanes 0-1 (quadrant 0, read directly from src2).
5695 __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5696 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5697 __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
     // Lanes 2-3 (quadrant 1).
5698 __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5699 __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5700 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5701 __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
     // Lanes 4-5 (quadrant 2).
5702 __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5703 __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5704 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5705 __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
     // Lanes 6-7 (quadrant 3).
5706 __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5707 __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5708 __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5709 __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5710 %}
5711 ins_pipe( pipe_slow );
5712 %}
5713
5714 // ====================VECTOR ARITHMETIC=======================================
5715
5716 // --------------------------------- ADD --------------------------------------
5717
5718 // Bytes vector add
// Element-wise add of two 4-byte vectors (vecS) via SSE paddb:
// dst[i] += src[i] for the 4 byte lanes.  Pure-SSE path (UseAVX == 0);
// the matching AVX form is selected by a separate instruct.
5719 instruct vadd4B(vecS dst, vecS src) %{
5720 predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
5721 match(Set dst (AddVB dst src));
5722 format %{ "paddb $dst,$src\t! add packed4B" %}
5723 ins_encode %{
5724 __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5725 %}
5726 ins_pipe( pipe_slow );
5727 %}
|