6673
6674 // Setup the registers to start vector comparison loop
6675 bind(COMPARE_WIDE_VECTORS);
6676 lea(str1, Address(str1, result, scale));
6677 lea(str2, Address(str2, result, scale));
6678 subl(result, stride2);
6679 subl(cnt2, stride2);
6680 jccb(Assembler::zero, COMPARE_WIDE_TAIL);
6681 negptr(result);
6682
6683 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
6684 bind(COMPARE_WIDE_VECTORS_LOOP);
6685 vmovdqu(vec1, Address(str1, result, scale));
6686 vpxor(vec1, Address(str2, result, scale));
6687 vptest(vec1, vec1);
6688 jccb(Assembler::notZero, VECTOR_NOT_EQUAL);
6689 addptr(result, stride2);
6690 subl(cnt2, stride2);
6691 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
6692 // clean upper bits of YMM registers
6693 vzeroupper();
6694
6695 // compare wide vectors tail
6696 bind(COMPARE_WIDE_TAIL);
6697 testptr(result, result);
6698 jccb(Assembler::zero, LENGTH_DIFF_LABEL);
6699
6700 movl(result, stride2);
6701 movl(cnt2, result);
6702 negptr(result);
6703 jmpb(COMPARE_WIDE_VECTORS_LOOP);
6704
6705 // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
6706 bind(VECTOR_NOT_EQUAL);
6707 // clean upper bits of YMM registers
6708 vzeroupper();
6709 lea(str1, Address(str1, result, scale));
6710 lea(str2, Address(str2, result, scale));
6711 jmp(COMPARE_16_CHARS);
6712
6713 // Compare tail chars, length between 1 and 15 chars
6714 bind(COMPARE_TAIL_LONG);
6715 movl(cnt2, result);
6716 cmpl(cnt2, stride);
6717 jccb(Assembler::less, COMPARE_SMALL_STR);
6718
6719 movdqu(vec1, Address(str1, 0));
6720 pcmpestri(vec1, Address(str2, 0), pcmpmask);
6721 jcc(Assembler::below, COMPARE_INDEX_CHAR);
6722 subptr(cnt2, stride);
6723 jccb(Assembler::zero, LENGTH_DIFF_LABEL);
6724 lea(str1, Address(str1, result, scale));
6725 lea(str2, Address(str2, result, scale));
6726 negptr(cnt2);
6727 jmpb(WHILE_HEAD_LABEL);
6728
6947 // Compare trailing char (final 2 bytes), if any
6948 bind(COMPARE_CHAR);
6949 testl(result, 0x2); // tail char
6950 jccb(Assembler::zero, TRUE_LABEL);
6951 load_unsigned_short(chr, Address(ary1, 0));
6952 load_unsigned_short(limit, Address(ary2, 0));
6953 cmpl(chr, limit);
6954 jccb(Assembler::notEqual, FALSE_LABEL);
6955
6956 bind(TRUE_LABEL);
6957 movl(result, 1); // return true
6958 jmpb(DONE);
6959
6960 bind(FALSE_LABEL);
6961 xorl(result, result); // return false
6962
6963 // That's it
6964 bind(DONE);
6965 if (UseAVX >= 2) {
6966 // clean upper bits of YMM registers
6967 vzeroupper();
6968 }
6969 }
6970
6971 void MacroAssembler::generate_fill(BasicType t, bool aligned,
6972 Register to, Register value, Register count,
6973 Register rtmp, XMMRegister xtmp) {
6974 ShortBranchVerifier sbv(this);
6975 assert_different_registers(to, value, count, rtmp);
6976 Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
6977 Label L_fill_2_bytes, L_fill_4_bytes;
6978
6979 int shift = -1;
6980 switch (t) {
6981 case T_BYTE:
6982 shift = 2;
6983 break;
6984 case T_SHORT:
6985 shift = 1;
6986 break;
6987 case T_INT:
7081 subl(count, 16 << shift);
7082 jcc(Assembler::less, L_check_fill_32_bytes);
7083 align(16);
7084
7085 BIND(L_fill_64_bytes_loop);
7086 vmovdqu(Address(to, 0), xtmp);
7087 vmovdqu(Address(to, 32), xtmp);
7088 addptr(to, 64);
7089 subl(count, 16 << shift);
7090 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7091
7092 BIND(L_check_fill_32_bytes);
7093 addl(count, 8 << shift);
7094 jccb(Assembler::less, L_check_fill_8_bytes);
7095 vmovdqu(Address(to, 0), xtmp);
7096 addptr(to, 32);
7097 subl(count, 8 << shift);
7098
7099 BIND(L_check_fill_8_bytes);
7100 // clean upper bits of YMM registers
7101 vzeroupper();
7102 } else {
7103 // Fill 32-byte chunks
7104 pshufd(xtmp, xtmp, 0);
7105
7106 subl(count, 8 << shift);
7107 jcc(Assembler::less, L_check_fill_8_bytes);
7108 align(16);
7109
7110 BIND(L_fill_32_bytes_loop);
7111
7112 if (UseUnalignedLoadStores) {
7113 movdqu(Address(to, 0), xtmp);
7114 movdqu(Address(to, 16), xtmp);
7115 } else {
7116 movq(Address(to, 0), xtmp);
7117 movq(Address(to, 8), xtmp);
7118 movq(Address(to, 16), xtmp);
7119 movq(Address(to, 24), xtmp);
7120 }
7121
7244 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector256 */ false);
7245 } else {
7246 movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7247 por(tmp2Reg, tmp3Reg);
7248 movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7249 por(tmp2Reg, tmp4Reg);
7250 }
7251 ptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector
7252 jccb(Assembler::notZero, L_copy_16_chars_exit);
7253 packuswb(tmp3Reg, tmp4Reg);
7254 }
7255 movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
7256
7257 bind(L_chars_16_check);
7258 addptr(len, 16);
7259 jccb(Assembler::lessEqual, L_copy_16_chars);
7260
7261 bind(L_copy_16_chars_exit);
7262 if (UseAVX >= 2) {
7263 // clean upper bits of YMM registers
7264 vzeroupper();
7265 }
7266 subptr(len, 8);
7267 jccb(Assembler::greater, L_copy_8_chars_exit);
7268
7269 bind(L_copy_8_chars);
7270 movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
7271 ptest(tmp3Reg, tmp1Reg);
7272 jccb(Assembler::notZero, L_copy_8_chars_exit);
7273 packuswb(tmp3Reg, tmp1Reg);
7274 movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
7275 addptr(len, 8);
7276 jccb(Assembler::lessEqual, L_copy_8_chars);
7277
7278 bind(L_copy_8_chars_exit);
7279 subptr(len, 8);
7280 jccb(Assembler::zero, L_done);
7281 }
7282
7283 bind(L_copy_1_char);
7284 load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
|
6673
6674 // Setup the registers to start vector comparison loop
6675 bind(COMPARE_WIDE_VECTORS);
6676 lea(str1, Address(str1, result, scale));
6677 lea(str2, Address(str2, result, scale));
6678 subl(result, stride2);
6679 subl(cnt2, stride2);
6680 jccb(Assembler::zero, COMPARE_WIDE_TAIL);
6681 negptr(result);
6682
6683 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
6684 bind(COMPARE_WIDE_VECTORS_LOOP);
6685 vmovdqu(vec1, Address(str1, result, scale));
6686 vpxor(vec1, Address(str2, result, scale));
6687 vptest(vec1, vec1);
6688 jccb(Assembler::notZero, VECTOR_NOT_EQUAL);
6689 addptr(result, stride2);
6690 subl(cnt2, stride2);
6691 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
6692 // clean upper bits of YMM registers
6693 vpxor(vec1, vec1);
6694
6695 // compare wide vectors tail
6696 bind(COMPARE_WIDE_TAIL);
6697 testptr(result, result);
6698 jccb(Assembler::zero, LENGTH_DIFF_LABEL);
6699
6700 movl(result, stride2);
6701 movl(cnt2, result);
6702 negptr(result);
6703 jmpb(COMPARE_WIDE_VECTORS_LOOP);
6704
6705 // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
6706 bind(VECTOR_NOT_EQUAL);
6707 // clean upper bits of YMM registers
6708 vpxor(vec1, vec1);
6709 lea(str1, Address(str1, result, scale));
6710 lea(str2, Address(str2, result, scale));
6711 jmp(COMPARE_16_CHARS);
6712
6713 // Compare tail chars, length between 1 and 15 chars
6714 bind(COMPARE_TAIL_LONG);
6715 movl(cnt2, result);
6716 cmpl(cnt2, stride);
6717 jccb(Assembler::less, COMPARE_SMALL_STR);
6718
6719 movdqu(vec1, Address(str1, 0));
6720 pcmpestri(vec1, Address(str2, 0), pcmpmask);
6721 jcc(Assembler::below, COMPARE_INDEX_CHAR);
6722 subptr(cnt2, stride);
6723 jccb(Assembler::zero, LENGTH_DIFF_LABEL);
6724 lea(str1, Address(str1, result, scale));
6725 lea(str2, Address(str2, result, scale));
6726 negptr(cnt2);
6727 jmpb(WHILE_HEAD_LABEL);
6728
6947 // Compare trailing char (final 2 bytes), if any
6948 bind(COMPARE_CHAR);
6949 testl(result, 0x2); // tail char
6950 jccb(Assembler::zero, TRUE_LABEL);
6951 load_unsigned_short(chr, Address(ary1, 0));
6952 load_unsigned_short(limit, Address(ary2, 0));
6953 cmpl(chr, limit);
6954 jccb(Assembler::notEqual, FALSE_LABEL);
6955
6956 bind(TRUE_LABEL);
6957 movl(result, 1); // return true
6958 jmpb(DONE);
6959
6960 bind(FALSE_LABEL);
6961 xorl(result, result); // return false
6962
6963 // That's it
6964 bind(DONE);
6965 if (UseAVX >= 2) {
6966 // clean upper bits of YMM registers
6967 vpxor(vec1, vec1);
6968 vpxor(vec2, vec2);
6969 }
6970 }
6971
6972 void MacroAssembler::generate_fill(BasicType t, bool aligned,
6973 Register to, Register value, Register count,
6974 Register rtmp, XMMRegister xtmp) {
6975 ShortBranchVerifier sbv(this);
6976 assert_different_registers(to, value, count, rtmp);
6977 Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
6978 Label L_fill_2_bytes, L_fill_4_bytes;
6979
6980 int shift = -1;
6981 switch (t) {
6982 case T_BYTE:
6983 shift = 2;
6984 break;
6985 case T_SHORT:
6986 shift = 1;
6987 break;
6988 case T_INT:
7082 subl(count, 16 << shift);
7083 jcc(Assembler::less, L_check_fill_32_bytes);
7084 align(16);
7085
7086 BIND(L_fill_64_bytes_loop);
7087 vmovdqu(Address(to, 0), xtmp);
7088 vmovdqu(Address(to, 32), xtmp);
7089 addptr(to, 64);
7090 subl(count, 16 << shift);
7091 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7092
7093 BIND(L_check_fill_32_bytes);
7094 addl(count, 8 << shift);
7095 jccb(Assembler::less, L_check_fill_8_bytes);
7096 vmovdqu(Address(to, 0), xtmp);
7097 addptr(to, 32);
7098 subl(count, 8 << shift);
7099
7100 BIND(L_check_fill_8_bytes);
7101 // clean upper bits of YMM registers
7102 movdl(xtmp, value);
7103 pshufd(xtmp, xtmp, 0);
7104 } else {
7105 // Fill 32-byte chunks
7106 pshufd(xtmp, xtmp, 0);
7107
7108 subl(count, 8 << shift);
7109 jcc(Assembler::less, L_check_fill_8_bytes);
7110 align(16);
7111
7112 BIND(L_fill_32_bytes_loop);
7113
7114 if (UseUnalignedLoadStores) {
7115 movdqu(Address(to, 0), xtmp);
7116 movdqu(Address(to, 16), xtmp);
7117 } else {
7118 movq(Address(to, 0), xtmp);
7119 movq(Address(to, 8), xtmp);
7120 movq(Address(to, 16), xtmp);
7121 movq(Address(to, 24), xtmp);
7122 }
7123
7246 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector256 */ false);
7247 } else {
7248 movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7249 por(tmp2Reg, tmp3Reg);
7250 movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7251 por(tmp2Reg, tmp4Reg);
7252 }
7253 ptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector
7254 jccb(Assembler::notZero, L_copy_16_chars_exit);
7255 packuswb(tmp3Reg, tmp4Reg);
7256 }
7257 movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
7258
7259 bind(L_chars_16_check);
7260 addptr(len, 16);
7261 jccb(Assembler::lessEqual, L_copy_16_chars);
7262
7263 bind(L_copy_16_chars_exit);
7264 if (UseAVX >= 2) {
7265 // clean upper bits of YMM registers
7266 vpxor(tmp2Reg, tmp2Reg);
7267 vpxor(tmp3Reg, tmp3Reg);
7268 vpxor(tmp4Reg, tmp4Reg);
7269 movdl(tmp1Reg, tmp5);
7270 pshufd(tmp1Reg, tmp1Reg, 0);
7271 }
7272 subptr(len, 8);
7273 jccb(Assembler::greater, L_copy_8_chars_exit);
7274
7275 bind(L_copy_8_chars);
7276 movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
7277 ptest(tmp3Reg, tmp1Reg);
7278 jccb(Assembler::notZero, L_copy_8_chars_exit);
7279 packuswb(tmp3Reg, tmp1Reg);
7280 movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
7281 addptr(len, 8);
7282 jccb(Assembler::lessEqual, L_copy_8_chars);
7283
7284 bind(L_copy_8_chars_exit);
7285 subptr(len, 8);
7286 jccb(Assembler::zero, L_done);
7287 }
7288
7289 bind(L_copy_1_char);
7290 load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
|