< prev index next >

hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp

Print this page
rev 7347 : 8078113: 8011102 changes may cause incorrect results
Summary: replace the vzeroupper instruction in stubs with zeroing of only the YMM registers actually used.
Reviewed-by: kvn
Contributed-by: sandhya.viswanathan@intel.com


6673 
6674     // Setup the registers to start vector comparison loop
6675     bind(COMPARE_WIDE_VECTORS);
6676     lea(str1, Address(str1, result, scale));
6677     lea(str2, Address(str2, result, scale));
6678     subl(result, stride2);
6679     subl(cnt2, stride2);
6680     jccb(Assembler::zero, COMPARE_WIDE_TAIL);
6681     negptr(result);
6682 
6683     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
6684     bind(COMPARE_WIDE_VECTORS_LOOP);
6685     vmovdqu(vec1, Address(str1, result, scale));
6686     vpxor(vec1, Address(str2, result, scale));
6687     vptest(vec1, vec1);
6688     jccb(Assembler::notZero, VECTOR_NOT_EQUAL);
6689     addptr(result, stride2);
6690     subl(cnt2, stride2);
6691     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
6692     // clean upper bits of YMM registers
6693     vzeroupper();
6694 
6695     // compare wide vectors tail
6696     bind(COMPARE_WIDE_TAIL);
6697     testptr(result, result);
6698     jccb(Assembler::zero, LENGTH_DIFF_LABEL);
6699 
6700     movl(result, stride2);
6701     movl(cnt2, result);
6702     negptr(result);
6703     jmpb(COMPARE_WIDE_VECTORS_LOOP);
6704 
6705     // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
6706     bind(VECTOR_NOT_EQUAL);
6707     // clean upper bits of YMM registers
6708     vzeroupper();
6709     lea(str1, Address(str1, result, scale));
6710     lea(str2, Address(str2, result, scale));
6711     jmp(COMPARE_16_CHARS);
6712 
6713     // Compare tail chars, length between 1 to 15 chars
6714     bind(COMPARE_TAIL_LONG);
6715     movl(cnt2, result);
6716     cmpl(cnt2, stride);
6717     jccb(Assembler::less, COMPARE_SMALL_STR);
6718 
6719     movdqu(vec1, Address(str1, 0));
6720     pcmpestri(vec1, Address(str2, 0), pcmpmask);
6721     jcc(Assembler::below, COMPARE_INDEX_CHAR);
6722     subptr(cnt2, stride);
6723     jccb(Assembler::zero, LENGTH_DIFF_LABEL);
6724     lea(str1, Address(str1, result, scale));
6725     lea(str2, Address(str2, result, scale));
6726     negptr(cnt2);
6727     jmpb(WHILE_HEAD_LABEL);
6728 


6947   // Compare trailing char (final 2 bytes), if any
6948   bind(COMPARE_CHAR);
6949   testl(result, 0x2);   // tail  char
6950   jccb(Assembler::zero, TRUE_LABEL);
6951   load_unsigned_short(chr, Address(ary1, 0));
6952   load_unsigned_short(limit, Address(ary2, 0));
6953   cmpl(chr, limit);
6954   jccb(Assembler::notEqual, FALSE_LABEL);
6955 
6956   bind(TRUE_LABEL);
6957   movl(result, 1);   // return true
6958   jmpb(DONE);
6959 
6960   bind(FALSE_LABEL);
6961   xorl(result, result); // return false
6962 
6963   // That's it
6964   bind(DONE);
6965   if (UseAVX >= 2) {
6966     // clean upper bits of YMM registers
6967     vzeroupper();

6968   }
6969 }
6970 
6971 void MacroAssembler::generate_fill(BasicType t, bool aligned,
6972                                    Register to, Register value, Register count,
6973                                    Register rtmp, XMMRegister xtmp) {
6974   ShortBranchVerifier sbv(this);
6975   assert_different_registers(to, value, count, rtmp);
6976   Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
6977   Label L_fill_2_bytes, L_fill_4_bytes;
6978 
6979   int shift = -1;
6980   switch (t) {
6981     case T_BYTE:
6982       shift = 2;
6983       break;
6984     case T_SHORT:
6985       shift = 1;
6986       break;
6987     case T_INT:


7081         subl(count, 16 << shift);
7082         jcc(Assembler::less, L_check_fill_32_bytes);
7083         align(16);
7084 
7085         BIND(L_fill_64_bytes_loop);
7086         vmovdqu(Address(to, 0), xtmp);
7087         vmovdqu(Address(to, 32), xtmp);
7088         addptr(to, 64);
7089         subl(count, 16 << shift);
7090         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7091 
7092         BIND(L_check_fill_32_bytes);
7093         addl(count, 8 << shift);
7094         jccb(Assembler::less, L_check_fill_8_bytes);
7095         vmovdqu(Address(to, 0), xtmp);
7096         addptr(to, 32);
7097         subl(count, 8 << shift);
7098 
7099         BIND(L_check_fill_8_bytes);
7100         // clean upper bits of YMM registers
7101         vzeroupper();

7102       } else {
7103         // Fill 32-byte chunks
7104         pshufd(xtmp, xtmp, 0);
7105 
7106         subl(count, 8 << shift);
7107         jcc(Assembler::less, L_check_fill_8_bytes);
7108         align(16);
7109 
7110         BIND(L_fill_32_bytes_loop);
7111 
7112         if (UseUnalignedLoadStores) {
7113           movdqu(Address(to, 0), xtmp);
7114           movdqu(Address(to, 16), xtmp);
7115         } else {
7116           movq(Address(to, 0), xtmp);
7117           movq(Address(to, 8), xtmp);
7118           movq(Address(to, 16), xtmp);
7119           movq(Address(to, 24), xtmp);
7120         }
7121 


7244         vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector256 */ false);
7245       } else {
7246         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7247         por(tmp2Reg, tmp3Reg);
7248         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7249         por(tmp2Reg, tmp4Reg);
7250       }
7251       ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
7252       jccb(Assembler::notZero, L_copy_16_chars_exit);
7253       packuswb(tmp3Reg, tmp4Reg);
7254     }
7255     movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
7256 
7257     bind(L_chars_16_check);
7258     addptr(len, 16);
7259     jccb(Assembler::lessEqual, L_copy_16_chars);
7260 
7261     bind(L_copy_16_chars_exit);
7262     if (UseAVX >= 2) {
7263       // clean upper bits of YMM registers
7264       vzeroupper();




7265     }
7266     subptr(len, 8);
7267     jccb(Assembler::greater, L_copy_8_chars_exit);
7268 
7269     bind(L_copy_8_chars);
7270     movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
7271     ptest(tmp3Reg, tmp1Reg);
7272     jccb(Assembler::notZero, L_copy_8_chars_exit);
7273     packuswb(tmp3Reg, tmp1Reg);
7274     movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
7275     addptr(len, 8);
7276     jccb(Assembler::lessEqual, L_copy_8_chars);
7277 
7278     bind(L_copy_8_chars_exit);
7279     subptr(len, 8);
7280     jccb(Assembler::zero, L_done);
7281   }
7282 
7283   bind(L_copy_1_char);
7284   load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));




6673 
6674     // Setup the registers to start vector comparison loop
6675     bind(COMPARE_WIDE_VECTORS);
6676     lea(str1, Address(str1, result, scale));
6677     lea(str2, Address(str2, result, scale));
6678     subl(result, stride2);
6679     subl(cnt2, stride2);
6680     jccb(Assembler::zero, COMPARE_WIDE_TAIL);
6681     negptr(result);
6682 
6683     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
6684     bind(COMPARE_WIDE_VECTORS_LOOP);
6685     vmovdqu(vec1, Address(str1, result, scale));
6686     vpxor(vec1, Address(str2, result, scale));
6687     vptest(vec1, vec1);
6688     jccb(Assembler::notZero, VECTOR_NOT_EQUAL);
6689     addptr(result, stride2);
6690     subl(cnt2, stride2);
6691     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
6692     // clean upper bits of YMM registers
6693     vpxor(vec1, vec1);
6694 
6695     // compare wide vectors tail
6696     bind(COMPARE_WIDE_TAIL);
6697     testptr(result, result);
6698     jccb(Assembler::zero, LENGTH_DIFF_LABEL);
6699 
6700     movl(result, stride2);
6701     movl(cnt2, result);
6702     negptr(result);
6703     jmpb(COMPARE_WIDE_VECTORS_LOOP);
6704 
6705     // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
6706     bind(VECTOR_NOT_EQUAL);
6707     // clean upper bits of YMM registers
6708     vpxor(vec1, vec1);
6709     lea(str1, Address(str1, result, scale));
6710     lea(str2, Address(str2, result, scale));
6711     jmp(COMPARE_16_CHARS);
6712 
6713     // Compare tail chars, length between 1 to 15 chars
6714     bind(COMPARE_TAIL_LONG);
6715     movl(cnt2, result);
6716     cmpl(cnt2, stride);
6717     jccb(Assembler::less, COMPARE_SMALL_STR);
6718 
6719     movdqu(vec1, Address(str1, 0));
6720     pcmpestri(vec1, Address(str2, 0), pcmpmask);
6721     jcc(Assembler::below, COMPARE_INDEX_CHAR);
6722     subptr(cnt2, stride);
6723     jccb(Assembler::zero, LENGTH_DIFF_LABEL);
6724     lea(str1, Address(str1, result, scale));
6725     lea(str2, Address(str2, result, scale));
6726     negptr(cnt2);
6727     jmpb(WHILE_HEAD_LABEL);
6728 


6947   // Compare trailing char (final 2 bytes), if any
6948   bind(COMPARE_CHAR);
6949   testl(result, 0x2);   // tail  char
6950   jccb(Assembler::zero, TRUE_LABEL);
6951   load_unsigned_short(chr, Address(ary1, 0));
6952   load_unsigned_short(limit, Address(ary2, 0));
6953   cmpl(chr, limit);
6954   jccb(Assembler::notEqual, FALSE_LABEL);
6955 
6956   bind(TRUE_LABEL);
6957   movl(result, 1);   // return true
6958   jmpb(DONE);
6959 
6960   bind(FALSE_LABEL);
6961   xorl(result, result); // return false
6962 
6963   // That's it
6964   bind(DONE);
6965   if (UseAVX >= 2) {
6966     // clean upper bits of YMM registers
6967     vpxor(vec1, vec1);
6968     vpxor(vec2, vec2);
6969   }
6970 }
6971 
6972 void MacroAssembler::generate_fill(BasicType t, bool aligned,
6973                                    Register to, Register value, Register count,
6974                                    Register rtmp, XMMRegister xtmp) {
6975   ShortBranchVerifier sbv(this);
6976   assert_different_registers(to, value, count, rtmp);
6977   Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
6978   Label L_fill_2_bytes, L_fill_4_bytes;
6979 
6980   int shift = -1;
6981   switch (t) {
6982     case T_BYTE:
6983       shift = 2;
6984       break;
6985     case T_SHORT:
6986       shift = 1;
6987       break;
6988     case T_INT:


7082         subl(count, 16 << shift);
7083         jcc(Assembler::less, L_check_fill_32_bytes);
7084         align(16);
7085 
7086         BIND(L_fill_64_bytes_loop);
7087         vmovdqu(Address(to, 0), xtmp);
7088         vmovdqu(Address(to, 32), xtmp);
7089         addptr(to, 64);
7090         subl(count, 16 << shift);
7091         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7092 
7093         BIND(L_check_fill_32_bytes);
7094         addl(count, 8 << shift);
7095         jccb(Assembler::less, L_check_fill_8_bytes);
7096         vmovdqu(Address(to, 0), xtmp);
7097         addptr(to, 32);
7098         subl(count, 8 << shift);
7099 
7100         BIND(L_check_fill_8_bytes);
7101         // clean upper bits of YMM registers
7102         movdl(xtmp, value);
7103         pshufd(xtmp, xtmp, 0);
7104       } else {
7105         // Fill 32-byte chunks
7106         pshufd(xtmp, xtmp, 0);
7107 
7108         subl(count, 8 << shift);
7109         jcc(Assembler::less, L_check_fill_8_bytes);
7110         align(16);
7111 
7112         BIND(L_fill_32_bytes_loop);
7113 
7114         if (UseUnalignedLoadStores) {
7115           movdqu(Address(to, 0), xtmp);
7116           movdqu(Address(to, 16), xtmp);
7117         } else {
7118           movq(Address(to, 0), xtmp);
7119           movq(Address(to, 8), xtmp);
7120           movq(Address(to, 16), xtmp);
7121           movq(Address(to, 24), xtmp);
7122         }
7123 


7246         vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector256 */ false);
7247       } else {
7248         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7249         por(tmp2Reg, tmp3Reg);
7250         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7251         por(tmp2Reg, tmp4Reg);
7252       }
7253       ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
7254       jccb(Assembler::notZero, L_copy_16_chars_exit);
7255       packuswb(tmp3Reg, tmp4Reg);
7256     }
7257     movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
7258 
7259     bind(L_chars_16_check);
7260     addptr(len, 16);
7261     jccb(Assembler::lessEqual, L_copy_16_chars);
7262 
7263     bind(L_copy_16_chars_exit);
7264     if (UseAVX >= 2) {
7265       // clean upper bits of YMM registers
7266       vpxor(tmp2Reg, tmp2Reg);
7267       vpxor(tmp3Reg, tmp3Reg);
7268       vpxor(tmp4Reg, tmp4Reg);
7269       movdl(tmp1Reg, tmp5);
7270       pshufd(tmp1Reg, tmp1Reg, 0);
7271     }
7272     subptr(len, 8);
7273     jccb(Assembler::greater, L_copy_8_chars_exit);
7274 
7275     bind(L_copy_8_chars);
7276     movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
7277     ptest(tmp3Reg, tmp1Reg);
7278     jccb(Assembler::notZero, L_copy_8_chars_exit);
7279     packuswb(tmp3Reg, tmp1Reg);
7280     movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
7281     addptr(len, 8);
7282     jccb(Assembler::lessEqual, L_copy_8_chars);
7283 
7284     bind(L_copy_8_chars_exit);
7285     subptr(len, 8);
7286     jccb(Assembler::zero, L_done);
7287   }
7288 
7289   bind(L_copy_1_char);
7290   load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));


< prev index next >