  } else {
    movl(x_xstart, Address(x, 0));
  }
  jmp(L_third_loop_prologue);

  bind(L_done);

  pop(zlen);
  pop(xlen);

  pop(tmp5);
  pop(tmp4);
  pop(tmp3);
  pop(tmp2);
  pop(tmp1);
}

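/**
 * Find the first index at which obja[] and objb[] differ, writing the element
 * index to 'result', or -1 if the ranges are equal. A rough scalar sketch of
 * the logic (illustrative only, not part of the original source):
 *
 *   for (long i = 0; i < length << scale; i++) {   // scale = log2_array_indxscale
 *     if (obja[i] != objb[i]) return i >> scale;   // element index of mismatch
 *   }
 *   return -1;
 */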
void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
                                         Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2) {
  assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
  Label VECTOR64_LOOP, VECTOR64_TAIL, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
  Label VECTOR32_LOOP, VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
  Label VECTOR16_TAIL, VECTOR8_TAIL, VECTOR4_TAIL;
  Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
  Label SAME_TILL_END, DONE;
  Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;

  // scale (log2_array_indxscale) is in rcx on both Win64 and Unix
  ShortBranchVerifier sbv(this);

  shlq(length);         // convert element count to byte count: length <<= scale
  xorq(result, result); // result tracks the current byte offset
  if ((UseAVX > 2) &&
      VM_Version::supports_avx512vlbw()) {
    set_programmed_mask_reg();  // opening of the stub context for programming mask registers
    cmpq(length, 64);
    jcc(Assembler::less, VECTOR32_TAIL);
    movq(tmp1, length);
    andq(tmp1, 0x3F);       // tail count (in bytes)
    andq(length, ~(0x3F));  // vector count (in bytes)

    bind(VECTOR64_LOOP);
    // AVX512 code to compare 64 byte vectors.
    evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
    evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
    kortestql(k7, k7);
    jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL); // mismatch
    addq(result, 64);
    subq(length, 64);
    jccb(Assembler::notZero, VECTOR64_LOOP);

    //bind(VECTOR64_TAIL);
    testq(tmp1, tmp1);
    jcc(Assembler::zero, SAME_TILL_END);

    bind(VECTOR64_TAIL);
    // AVX512 code to compare up to 63 byte vectors.
    // Save k1
    kmovql(k3, k1);
    mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
    shlxq(tmp2, tmp2, tmp1);  // clear the low 'tail count' bits...
    notq(tmp2);               // ...then invert, leaving only those bits set
    kmovql(k1, tmp2);
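    // Illustrative example (not in the original source): for a 5-byte tail,
    // tmp1 = 5, so tmp2 = ~(0xFF...FF << 5) = 0x1F and k1 enables only the
    // low 5 byte lanes of the masked load and compare below.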

    evmovdqub(k1, false, rymm0, Address(obja, result), Assembler::AVX_512bit);
    evpcmpeqb(k1, false, k7, rymm0, Address(objb, result), Assembler::AVX_512bit);

    ktestql(k7, k1);
    // Restore k1
    kmovql(k1, k3);
    jcc(Assembler::below, SAME_TILL_END); // not mismatch

    bind(VECTOR64_NOT_EQUAL);
    kmovql(tmp1, k7);   // k7 holds 1-bits where the bytes compared equal
    notq(tmp1);
    tzcntq(tmp1, tmp1); // index of the first mismatching byte
    addq(result, tmp1);
    shrq(result);       // byte offset -> element index (shift by scale in cl)
    jmp(DONE);
    bind(VECTOR32_TAIL);
    clear_programmed_mask_reg();  // closing of the stub context for programming mask registers
  }
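
  // At this point the remaining length is below 64 bytes (or AVX512BW/VL is
  // unavailable); fall through to the 32/16/8/4-byte and scalar byte paths.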
  cmpq(length, 8);
  jcc(Assembler::equal, VECTOR8_LOOP);
  jcc(Assembler::less, VECTOR4_TAIL);

  if (UseAVX >= 2) {

    cmpq(length, 16);
    jcc(Assembler::equal, VECTOR16_LOOP);
    jcc(Assembler::less, VECTOR8_LOOP);

    cmpq(length, 32);
    jccb(Assembler::less, VECTOR16_TAIL);

    subq(length, 32); // bias length so the loop exit test can be >= 0
    bind(VECTOR32_LOOP);
    vmovdqu(rymm0, Address(obja, result));
    vmovdqu(rymm1, Address(objb, result));
    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
    vptest(rymm2, rymm2);
    jcc(Assembler::notZero, VECTOR32_NOT_EQUAL); // mismatch found
    addq(result, 32);
    subq(length, 32);
    jccb(Assembler::greaterEqual, VECTOR32_LOOP);
    addq(length, 32); // undo the bias; length is the remaining tail count
    jcc(Assembler::equal, SAME_TILL_END);

  // ... (the VECTOR16/VECTOR8/VECTOR4 loops and tails and the head of the
  // byte-compare tail are elided from this excerpt) ...
  testl(tmp1, tmp1);
  jccb(Assembler::notZero, BYTES_NOT_EQUAL); // mismatch found
  decq(length);
  jccb(Assembler::zero, SAME_TILL_END);
  incq(result);
  load_unsigned_byte(tmp1, Address(obja, result));
  load_unsigned_byte(tmp2, Address(objb, result));
  xorl(tmp1, tmp2);
  testl(tmp1, tmp1);
  jccb(Assembler::notZero, BYTES_NOT_EQUAL); // mismatch found
  decq(length);
  jccb(Assembler::zero, SAME_TILL_END);
  incq(result);
  load_unsigned_byte(tmp1, Address(obja, result));
  load_unsigned_byte(tmp2, Address(objb, result));
  xorl(tmp1, tmp2);
  testl(tmp1, tmp1);
  jccb(Assembler::notZero, BYTES_NOT_EQUAL); // mismatch found
  jmpb(SAME_TILL_END);

  if (UseAVX >= 2) {
    bind(VECTOR32_NOT_EQUAL);
    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit); // rymm2 = all ones
    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit); // 0xFF where bytes are equal
    vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);    // invert: 0xFF where bytes differ
    vpmovmskb(tmp1, rymm0);
    bsfq(tmp1, tmp1);   // index of the first differing byte
    addq(result, tmp1);
    shrq(result);       // byte offset -> element index
    jmpb(DONE);
  }

  bind(VECTOR16_NOT_EQUAL);
  if (UseAVX >= 2) {
    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
    pxor(rymm0, rymm2);
  } else {
    pcmpeqb(rymm2, rymm2); // rymm2 = all ones
    pxor(rymm0, rymm1);    // rymm0 held obja^objb from the loop; restore obja's bytes
    pcmpeqb(rymm0, rymm1); // 0xFF where bytes are equal
    pxor(rymm0, rymm2);    // invert: 0xFF where bytes differ
  }
  pmovmskb(tmp1, rymm0);
  bsfq(tmp1, tmp1);
  addq(result, tmp1);
  shrq(result);
  jmpb(DONE);

  bind(VECTOR8_NOT_EQUAL);
  bind(VECTOR4_NOT_EQUAL);
  bsfq(tmp1, tmp1);  // bit index of the first differing bit
  shrq(tmp1, 3);     // bit index -> byte index
  addq(result, tmp1);
  bind(BYTES_NOT_EQUAL);
  shrq(result);      // byte offset -> element index
  jmpb(DONE);

  bind(SAME_TILL_END);
  mov64(result, -1); // ranges are equal

  bind(DONE);
}

// Helper functions for square_to_len()

/**
 * Store the squares of x[], right shifted one bit (divided by 2), into z[].
 * Preserves x and z and modifies the rest of the registers.
 */
void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
  // Perform square and right shift by 1.
  // Handle the odd xlen case first; then, for even xlen, do the following:
  // jlong carry = 0;
  // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
  //   huge_128 product = x[j:j+1] * x[j:j+1];
  //   z[i:i+1]   = (carry << 63) | (jlong)(product >>> 65);
  //   z[i+2:i+3] = (jlong)(product >>> 1);
  //   carry = (jlong)product;
  // }
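  //
  // Why 65 and not 64 (illustrative note, not in the original): product >>> 1
  // is the 127-bit halved square, and its high 64-bit word is product >>> 65;
  // the bit shifted out at the bottom survives as the low bit of 'carry' and
  // becomes bit 63 of the next pair's high word via (carry << 63).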

  xorq(tmp5, tmp5);   // carry
  xorq(rdxReg, rdxReg);