src/cpu/x86/vm/macroAssembler_x86.cpp

9412   } else {
9413     movl(x_xstart, Address(x,  0));
9414   }
9415   jmp(L_third_loop_prologue);
9416 
9417   bind(L_done);
9418 
9419   pop(zlen);
9420   pop(xlen);
9421 
9422   pop(tmp5);
9423   pop(tmp4);
9424   pop(tmp3);
9425   pop(tmp2);
9426   pop(tmp1);
9427 }
9428 
9429 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
9430   Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
9431   assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");

9432   Label VECTOR32_LOOP, VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
9433   Label VECTOR16_TAIL, VECTOR8_TAIL, VECTOR4_TAIL;
9434   Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
9435   Label SAME_TILL_END, DONE;
9436   Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
9437 
9438   //scale is in rcx in both Win64 and Unix
9439   ShortBranchVerifier sbv(this);
9440 
9441   shlq(length);
9442   xorq(result, result);
9443 
9444   cmpq(length, 8);
9445   jcc(Assembler::equal, VECTOR8_LOOP);
9446   jcc(Assembler::less, VECTOR4_TAIL);
9447 
9448   if (UseAVX >= 2){
9449 
9450     cmpq(length, 16);
9451     jcc(Assembler::equal, VECTOR16_LOOP);
9452     jcc(Assembler::less, VECTOR8_LOOP);
9453 
9454     cmpq(length, 32);
9455     jccb(Assembler::less, VECTOR16_TAIL);
9456 
9457     subq(length, 32);
9458     bind(VECTOR32_LOOP);
9459     vmovdqu(rymm0, Address(obja, result));
9460     vmovdqu(rymm1, Address(objb, result));
9461     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
9462     vptest(rymm2, rymm2);
9463     jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
9464     addq(result, 32);
9465     subq(length, 32);
9466     jccb(Assembler::greaterEqual, VECTOR32_LOOP);
9467     addq(length, 32);
9468     jcc(Assembler::equal, SAME_TILL_END);


9536   testl(tmp1, tmp1);
9537   jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
9538   decq(length);
9539   jccb(Assembler::zero, SAME_TILL_END);
9540   incq(result);
9541   load_unsigned_byte(tmp1, Address(obja, result));
9542   load_unsigned_byte(tmp2, Address(objb, result));
9543   xorl(tmp1, tmp2);
9544   testl(tmp1, tmp1);
9545   jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
9546   decq(length);
9547   jccb(Assembler::zero, SAME_TILL_END);
9548   incq(result);
9549   load_unsigned_byte(tmp1, Address(obja, result));
9550   load_unsigned_byte(tmp2, Address(objb, result));
9551   xorl(tmp1, tmp2);
9552   testl(tmp1, tmp1);
9553   jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
9554   jmpb(SAME_TILL_END);
9555 
9556   if (UseAVX >= 2){
9557     bind(VECTOR32_NOT_EQUAL);
9558     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
9559     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
9560     vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
9561     vpmovmskb(tmp1, rymm0);
9562     bsfq(tmp1, tmp1);
9563     addq(result, tmp1);
9564     shrq(result);
9565     jmpb(DONE);
9566   }
9567 
9568   bind(VECTOR16_NOT_EQUAL);
9569   if (UseAVX >= 2){
9570     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
9571     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
9572     pxor(rymm0, rymm2);
9573   } else {
9574     pcmpeqb(rymm2, rymm2);
9575     pxor(rymm0, rymm1);
9576     pcmpeqb(rymm0, rymm1);
9577     pxor(rymm0, rymm2);
9578   }
9579   pmovmskb(tmp1, rymm0);
9580   bsfq(tmp1, tmp1);
9581   addq(result, tmp1);
9582   shrq(result);
9583   jmpb(DONE);
9584 
9585   bind(VECTOR8_NOT_EQUAL);
9586   bind(VECTOR4_NOT_EQUAL);
9587   bsfq(tmp1, tmp1);
9588   shrq(tmp1, 3);
9589   addq(result, tmp1);
9590   bind(BYTES_NOT_EQUAL);
9591   shrq(result);
9592   jmpb(DONE);
9593 
9594   bind(SAME_TILL_END);
9595   mov64(result, -1);
9596 
9597   bind(DONE);
9598 }
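
The *_NOT_EQUAL paths above all end the same way: vpcmpeqb/pcmpeqb marks equal bytes with 0xFF, xoring with an all-ones register inverts that so the mismatching bytes are flagged, pmovmskb packs the flags into a bit mask, bsf picks the lowest set bit (the first differing byte), and the final shrq(result), a shift by cl (log2_array_indxscale), turns the byte offset into an element index. A minimal scalar sketch of that extraction, with a hypothetical helper name and a GCC/Clang builtin standing in for bsf:

    #include <cstdint>
    #include <cstddef>

    // Scalar model of the *_NOT_EQUAL index extraction: 'chunk' bytes starting at
    // 'byte_off' are known to contain a difference; 'scale' plays the role of
    // log2_array_indxscale (element size = 1 << scale bytes).
    static size_t first_mismatch_index(const uint8_t* a, const uint8_t* b,
                                       size_t byte_off, int chunk, unsigned scale) {
      uint32_t mask = 0;
      for (int i = 0; i < chunk; i++) {
        if (a[byte_off + i] != b[byte_off + i])   // pcmpeqb, then invert with all-ones
          mask |= 1u << i;                        // pmovmskb
      }
      size_t first_byte = byte_off + __builtin_ctz(mask);  // bsfq + addq(result, tmp1)
      return first_byte >> scale;                          // shrq(result) by cl
    }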
9599 
9600 
9601 //Helper functions for square_to_len()
9602 
9603 /**
9604  * Store the squares of x[], right shifted one bit (divided by 2) into z[]
9605  * Preserves x and z and modifies the rest of the registers.
9606  */
9607 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
9608   // Perform square and right shift by 1
9609   // Handle odd xlen case first, then for even xlen do the following
9610   // jlong carry = 0;
9611   // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
9612   //     huge_128 product = x[j:j+1] * x[j:j+1];
9613   //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
9614   //     z[i+2:i+3] = (jlong)(product >>> 1);
9615   //     carry = (jlong)product;
9616   // }
9617 
9618   xorq(tmp5, tmp5);     // carry
9619   xorq(rdxReg, rdxReg);




9412   } else {
9413     movl(x_xstart, Address(x,  0));
9414   }
9415   jmp(L_third_loop_prologue);
9416 
9417   bind(L_done);
9418 
9419   pop(zlen);
9420   pop(xlen);
9421 
9422   pop(tmp5);
9423   pop(tmp4);
9424   pop(tmp3);
9425   pop(tmp2);
9426   pop(tmp1);
9427 }
9428 
9429 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
9430   Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
9431   assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
9432   Label VECTOR64_LOOP, VECTOR64_TAIL, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
9433   Label VECTOR32_LOOP, VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
9434   Label VECTOR16_TAIL, VECTOR8_TAIL, VECTOR4_TAIL;
9435   Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
9436   Label SAME_TILL_END, DONE;
9437   Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
9438 
9439   //scale is in rcx in both Win64 and Unix
9440   ShortBranchVerifier sbv(this);
9441 
9442   shlq(length);
9443   xorq(result, result);
9444 
9445   if ((UseAVX > 2) &&
9446       VM_Version::supports_avx512vlbw()) {
9447     set_programmed_mask_reg();  // opening of the stub context for programming mask registers
9448     cmpq(length, 64);
9449     jcc(Assembler::less, VECTOR32_TAIL);
9450     movq(tmp1, length);
9451     andq(tmp1, 0x3F);      // tail count
9452     andq(length, ~(0x3F)); //vector count
9453 
9454     bind(VECTOR64_LOOP);
9455     // AVX512 code to compare 64 byte vectors.
9456     evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
9457     evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
9458     kortestql(k7, k7);
9459     jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL);     // mismatch
9460     addq(result, 64);
9461     subq(length, 64);
9462     jccb(Assembler::notZero, VECTOR64_LOOP);
9463 
9464     //bind(VECTOR64_TAIL);
9465     testq(tmp1, tmp1);
9466     jcc(Assembler::zero, SAME_TILL_END);
9467 
9468     bind(VECTOR64_TAIL);
9469     // AVX512 code to compare up to 63 byte vectors.
9470     // Save k1
9471     kmovql(k3, k1);
9472     mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
9473     shlxq(tmp2, tmp2, tmp1);
9474     notq(tmp2);
9475     kmovql(k1, tmp2);
9476 
9477     evmovdqub(k1, false, rymm0, Address(obja, result), Assembler::AVX_512bit);
9478     evpcmpeqb(k1, false, k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
9479 
9480     ktestql(k7, k1);
9481     // Restore k1
9482     kmovql(k1, k3);
9483     jcc(Assembler::below, SAME_TILL_END);     // not mismatch
9484 
9485     bind(VECTOR64_NOT_EQUAL);
9486     kmovql(tmp1, k7);
9487     notq(tmp1);
9488     tzcntq(tmp1, tmp1);
9489     addq(result, tmp1);
9490     shrq(result);
9491     jmp(DONE);
9492     bind(VECTOR32_TAIL);
9493     clear_programmed_mask_reg();   // closing of the stub context for programming mask registers
9494   }
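
The 1..63 byte tail above is handled under an opmask: mov64/shlxq/notq compute ~(~0 << tail), a 64-bit value whose low tmp1 bits are set, and kmovql loads it into k1 so the masked evmovdqub/evpcmpeqb only touch the remaining bytes. A minimal sketch of that mask computation, assuming a hypothetical helper name rather than a HotSpot API:

    #include <cstdint>

    // Mask with the low 'tail' (1..63) bits set, mirroring mov64/shlxq/notq above.
    static uint64_t tail_mask(unsigned tail) {
      uint64_t m = 0xFFFFFFFFFFFFFFFFull;  // mov64(tmp2, 0xFFFFFFFFFFFFFFFF)
      m <<= tail;                          // shlxq(tmp2, tmp2, tmp1)
      return ~m;                           // notq(tmp2), then kmovql(k1, tmp2)
    }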
9495 
9496   cmpq(length, 8);
9497   jcc(Assembler::equal, VECTOR8_LOOP);
9498   jcc(Assembler::less, VECTOR4_TAIL);
9499 
9500   if (UseAVX >= 2) {
9501 
9502     cmpq(length, 16);
9503     jcc(Assembler::equal, VECTOR16_LOOP);
9504     jcc(Assembler::less, VECTOR8_LOOP);
9505 
9506     cmpq(length, 32);
9507     jccb(Assembler::less, VECTOR16_TAIL);
9508 
9509     subq(length, 32);
9510     bind(VECTOR32_LOOP);
9511     vmovdqu(rymm0, Address(obja, result));
9512     vmovdqu(rymm1, Address(objb, result));
9513     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
9514     vptest(rymm2, rymm2);
9515     jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
9516     addq(result, 32);
9517     subq(length, 32);
9518     jccb(Assembler::greaterEqual, VECTOR32_LOOP);
9519     addq(length, 32);
9520     jcc(Assembler::equal, SAME_TILL_END);


9588   testl(tmp1, tmp1);
9589   jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
9590   decq(length);
9591   jccb(Assembler::zero, SAME_TILL_END);
9592   incq(result);
9593   load_unsigned_byte(tmp1, Address(obja, result));
9594   load_unsigned_byte(tmp2, Address(objb, result));
9595   xorl(tmp1, tmp2);
9596   testl(tmp1, tmp1);
9597   jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
9598   decq(length);
9599   jccb(Assembler::zero, SAME_TILL_END);
9600   incq(result);
9601   load_unsigned_byte(tmp1, Address(obja, result));
9602   load_unsigned_byte(tmp2, Address(objb, result));
9603   xorl(tmp1, tmp2);
9604   testl(tmp1, tmp1);
9605   jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
9606   jmpb(SAME_TILL_END);
9607 
9608   if (UseAVX >= 2) {
9609     bind(VECTOR32_NOT_EQUAL);
9610     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
9611     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
9612     vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
9613     vpmovmskb(tmp1, rymm0);
9614     bsfq(tmp1, tmp1);
9615     addq(result, tmp1);
9616     shrq(result);
9617     jmpb(DONE);
9618   }
9619 
9620   bind(VECTOR16_NOT_EQUAL);
9621   if (UseAVX >= 2) {
9622     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
9623     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
9624     pxor(rymm0, rymm2);
9625   } else {
9626     pcmpeqb(rymm2, rymm2);
9627     pxor(rymm0, rymm1);
9628     pcmpeqb(rymm0, rymm1);
9629     pxor(rymm0, rymm2);
9630   }
9631   pmovmskb(tmp1, rymm0);
9632   bsfq(tmp1, tmp1);
9633   addq(result, tmp1);
9634   shrq(result);
9635   jmpb(DONE);
9636 
9637   bind(VECTOR8_NOT_EQUAL);
9638   bind(VECTOR4_NOT_EQUAL);
9639   bsfq(tmp1, tmp1);
9640   shrq(tmp1, 3);
9641   addq(result, tmp1);
9642   bind(BYTES_NOT_EQUAL);
9643   shrq(result);
9644   jmpb(DONE);
9645 
9646   bind(SAME_TILL_END);
9647   mov64(result, -1);
9648 
9649   bind(DONE);
9650 }
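
Taken as a whole, the stub compares 'length' elements (the entry shlq(length) scales the element count into a byte count, since log2_array_indxscale sits in rcx) and returns the element index of the first mismatch, or -1 when all bytes compare equal. A hedged C++ model of that contract, with assumed parameter names rather than the stub's actual registers:

    #include <cstdint>

    // 'length' is an element count, 'scale' = log2(element size in bytes).
    static int64_t vectorized_mismatch_model(const uint8_t* a, const uint8_t* b,
                                             int64_t length, unsigned scale) {
      int64_t bytes = length << scale;        // shlq(length) by cl
      for (int64_t i = 0; i < bytes; i++) {   // done 64/32/16/8/4 bytes at a time above
        if (a[i] != b[i])
          return i >> scale;                  // byte offset -> element index (shrq(result))
      }
      return -1;                              // SAME_TILL_END: mov64(result, -1)
    }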

9651 
9652 //Helper functions for square_to_len()
9653 
9654 /**
9655  * Store the squares of x[], right shifted one bit (divided by 2) into z[]
9656  * Preserves x and z and modifies the rest of the registers.
9657  */
9658 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
9659   // Perform square and right shift by 1
9660   // Handle odd xlen case first, then for even xlen do the following
9661   // jlong carry = 0;
9662   // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
9663   //     huge_128 product = x[j:j+1] * x[j:j+1];
9664   //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
9665   //     z[i+2:i+3] = (jlong)(product >>> 1);
9666   //     carry = (jlong)product;
9667   // }
9668 
9669   xorq(tmp5, tmp5);     // carry
9670   xorq(rdxReg, rdxReg);
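
A hedged C++ model of the even-xlen loop sketched in the comment above, treating each x[j:j+1] / z[i:i+1] int pair as a single 64-bit word and using the compiler's unsigned __int128 extension for the huge_128 product (both are assumptions of this sketch, not the stub's actual layout):

    #include <cstdint>

    // Square each word of x, shift the 128-bit product right by one, and thread
    // the shifted-out low bit through 'carry' into the top bit of the next word.
    static void square_rshift_model(const uint64_t* x, int xwords, uint64_t* z) {
      uint64_t carry = 0;
      for (int j = 0, i = 0; j < xwords; j++, i += 2) {
        unsigned __int128 product = (unsigned __int128) x[j] * x[j];
        z[i]     = (carry << 63) | (uint64_t)(product >> 65);  // z[i:i+1]
        z[i + 1] = (uint64_t)(product >> 1);                   // z[i+2:i+3]
        carry    = (uint64_t)product;                          // only bit 0 survives the << 63
      }
    }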

