6958 xorptr(tmp, tmp);
6959 if (UseFastStosb) {
6960 shlptr(cnt,3); // convert to number of bytes
6961 rep_stosb();
6962 } else {
6963 NOT_LP64(shlptr(cnt,1);) // convert to number of dwords for 32-bit VM
6964 rep_stos();
6965 }
6966 }
6967
6968 #ifdef COMPILER2
6969
6970 // IndexOf for constant substrings with size >= 8 chars
6971 // which don't need to be loaded through stack.
6972 void MacroAssembler::string_indexofC8(Register str1, Register str2,
6973 Register cnt1, Register cnt2,
6974 int int_cnt2, Register result,
6975 XMMRegister vec, Register tmp,
6976 int ae) {
6977 ShortBranchVerifier sbv(this);
6978 assert(UseSSE42Intrinsics, "SSE4.2 is required");
6979 assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
6980
6981 // This method uses the pcmpestri instruction with bound registers
6982 // inputs:
6983 // xmm - substring
6984 // rax - substring length (elements count)
6985 // mem - scanned string
6986 // rdx - string length (elements count)
6987 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
6988 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
6989 // outputs:
6990 // rcx - matched index in string
6991 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6992 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
6993 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
6994 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
6995 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
6996
6997 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
6998 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
7136 } // (int_cnt2 > 8)
7137
7138 bind(RET_FOUND);
7139 // Found result if we matched full small substring.
7140 // Compute substr offset
7141 subptr(result, str1);
7142 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
7143 shrl(result, 1); // index
7144 }
7145 bind(EXIT);
7146
7147 } // string_indexofC8
7148
7149 // Small strings are loaded through stack if they cross page boundary.
7150 void MacroAssembler::string_indexof(Register str1, Register str2,
7151 Register cnt1, Register cnt2,
7152 int int_cnt2, Register result,
7153 XMMRegister vec, Register tmp,
7154 int ae) {
7155 ShortBranchVerifier sbv(this);
7156 assert(UseSSE42Intrinsics, "SSE4.2 is required");
7157 assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
7158
7159 //
7160 // int_cnt2 is length of small (< 8 chars) constant substring
7161 // or (-1) for non constant substring in which case its length
7162 // is in cnt2 register.
7163 //
7164 // Note, inline_string_indexOf() generates checks:
7165 // if (substr.count > string.count) return -1;
7166 // if (substr.count == 0) return 0;
7167 //
7168 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
7169 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
7170 // This method uses the pcmpestri instruction with bound registers
7171 // inputs:
7172 // xmm - substring
7173 // rax - substring length (elements count)
7174 // mem - scanned string
7175 // rdx - string length (elements count)
7176 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
7453 jmpb(SCAN_SUBSTR);
7454
7455 bind(RET_FOUND_LONG);
7456 movptr(str1, Address(rsp, wordSize));
7457 } // non constant
7458
7459 bind(RET_FOUND);
7460 // Compute substr offset
7461 subptr(result, str1);
7462 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
7463 shrl(result, 1); // index
7464 }
7465 bind(CLEANUP);
7466 pop(rsp); // restore SP
7467
7468 } // string_indexof
7469
7470 void MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
7471 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
// Emit code that scans the array of 16-bit chars at 'str1' ('cnt1' elements)
// for the 16-bit value in 'ch'.  On exit 'result' holds the element index of
// the first occurrence, or -1 if the char is absent.  'cnt1' and 'tmp' are
// clobbered as scratch; 'ch' is overwritten on the vector-match path (bsfl
// below); vec1..vec3 are scratch XMM registers.
7472 ShortBranchVerifier sbv(this);
7473 assert(UseSSE42Intrinsics, "SSE4.2 is required");
7474
// 8 chars (16 bytes) are handled per SSE iteration; the AVX2 loop does 2*stride.
7475 int stride = 8;
7476
7477 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
7478 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
7479 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
7480 FOUND_SEQ_CHAR, DONE_LABEL;
7481
// 'result' doubles as the running scan pointer; it is converted back to an
// element index at FOUND_SEQ_CHAR.
7482 movptr(result, str1);
7483 if (UseAVX >= 2) {
// Fewer than 8 chars: go straight to the scalar loop.
7484 cmpl(cnt1, stride);
7485 jccb(Assembler::less, SCAN_TO_CHAR_LOOP);
// Fewer than 16 chars: skip the 32-byte loop, set up the 16-byte SSE loop.
7486 cmpl(cnt1, 2*stride);
7487 jccb(Assembler::less, SCAN_TO_8_CHAR_INIT);
// Broadcast 'ch' into all 16 word lanes of vec1; keep vec2 all-zero for the
// vptest trick used in the loop below.
7488 movdl(vec1, ch);
7489 vpbroadcastw(vec1, vec1);
7490 vpxor(vec2, vec2);
7491 movl(tmp, cnt1);
7492 andl(tmp, 0xFFFFFFF0); //vector count (in chars)
7493 andl(cnt1,0x0000000F); //tail count (in chars)
7494
// Main AVX2 loop: compare 16 chars (32 bytes) per iteration.
7495 bind(SCAN_TO_16_CHAR_LOOP);
7496 vmovdqu(vec3, Address(result, 0));
// Word lanes of vec3 become all-ones where they equal 'ch'.
7497 vpcmpeqw(vec3, vec3, vec1, 1);
// VPTEST: CF = ((vec3 & ~vec2) == 0); with vec2 == 0, CF is set iff vec3 is
// all-zero, so carryClear <=> at least one lane matched.
7498 vptest(vec2, vec3);
7499 jcc(Assembler::carryClear, FOUND_CHAR);
7500 addptr(result, 32);
7501 subl(tmp, 2*stride);
7502 jccb(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
7503 jmp(SCAN_TO_8_CHAR);
// Entered when 8 <= cnt1 < 16: replicate 'ch' into all 8 word lanes of vec1
// for the SSE loop (pshuflw spreads the low word, pshufd the low dword).
7504 bind(SCAN_TO_8_CHAR_INIT);
7505 movdl(vec1, ch);
7506 pshuflw(vec1, vec1, 0x00);
7507 pshufd(vec1, vec1, 0);
7508 pxor(vec2, vec2);
7509 }
// This guard is always true given the UseSSE42Intrinsics assert above.
7510 if (UseAVX >= 2 || UseSSE42Intrinsics) {
7511 bind(SCAN_TO_8_CHAR);
7512 cmpl(cnt1, stride);
7513 if (UseAVX >= 2) {
// AVX2 path already split off the tail count; short tails go to the scalar scan.
7514 jccb(Assembler::less, SCAN_TO_CHAR);
7515 }
7516 if (!(UseAVX >= 2)) {
// Pure SSE path: vec1/vec2 were never initialized, so short inputs must go
// directly to the scalar loop, and longer ones broadcast 'ch' here.
7517 jccb(Assembler::less, SCAN_TO_CHAR_LOOP);
7518 movdl(vec1, ch);
7519 pshuflw(vec1, vec1, 0x00);
7520 pshufd(vec1, vec1, 0);
7521 pxor(vec2, vec2);
7522 }
7523 movl(tmp, cnt1);
7524 andl(tmp, 0xFFFFFFF8); //vector count (in chars)
7525 andl(cnt1,0x00000007); //tail count (in chars)
7526
// SSE loop: compare 8 chars (16 bytes) per iteration; same zero-vec2 PTEST
// trick as above (carryClear <=> some lane matched).
7527 bind(SCAN_TO_8_CHAR_LOOP);
7528 movdqu(vec3, Address(result, 0));
7529 pcmpeqw(vec3, vec1);
7530 ptest(vec2, vec3);
7531 jcc(Assembler::carryClear, FOUND_CHAR);
7532 addptr(result, 16);
7533 subl(tmp, stride);
7534 jccb(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
7535 }
// Scalar tail: cnt1 holds the remaining (< stride) char count.
7536 bind(SCAN_TO_CHAR);
7537 testl(cnt1, cnt1);
7538 jcc(Assembler::zero, RET_NOT_FOUND);
7539
// One char per iteration.
7540 bind(SCAN_TO_CHAR_LOOP);
7541 load_unsigned_short(tmp, Address(result, 0));
7542 cmpl(ch, tmp);
7543 jccb(Assembler::equal, FOUND_SEQ_CHAR);
7544 addptr(result, 2);
7545 subl(cnt1, 1);
7546 jccb(Assembler::zero, RET_NOT_FOUND);
7547 jmp(SCAN_TO_CHAR_LOOP);
7548
7549 bind(RET_NOT_FOUND);
7550 movl(result, -1);
7551 jmpb(DONE_LABEL);
7552
7553 if (UseAVX >= 2 || UseSSE42Intrinsics) {
// Vector hit: build a byte mask of the matching lanes, then bsf finds the
// lowest set bit, i.e. the byte offset of the first match inside the vector.
7554 bind(FOUND_CHAR);
7555 if (UseAVX >= 2) {
7556 vpmovmskb(tmp, vec3);
7557 } else {
7558 pmovmskb(tmp, vec3);
7559 }
7560 bsfl(ch, tmp);
7561 addl(result, ch);
7562 }
7563
// Convert the scan pointer back to an element index: (result - str1) / 2.
7564 bind(FOUND_SEQ_CHAR);
7565 subptr(result, str1);
7566 shrl(result, 1);
7567
7568 bind(DONE_LABEL);
7569 } // string_indexof_char
7570
7571 // helper function for string_compare
7572 void MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
7573 Address::ScaleFactor scale, Address::ScaleFactor scale1,
7574 Address::ScaleFactor scale2, Register index, int ae) {
7575 if (ae == StrIntrinsicNode::LL) {
7576 load_unsigned_byte(elem1, Address(str1, index, scale, 0));
7577 load_unsigned_byte(elem2, Address(str2, index, scale, 0));
7578 } else if (ae == StrIntrinsicNode::UU) {
7579 load_unsigned_short(elem1, Address(str1, index, scale, 0));
7580 load_unsigned_short(elem2, Address(str2, index, scale, 0));
7581 } else {
7582 load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
7631
7632 // Check if the strings start at the same location and setup scale and stride
7633 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7634 cmpptr(str1, str2);
7635 jcc(Assembler::equal, LENGTH_DIFF_LABEL);
7636 if (ae == StrIntrinsicNode::LL) {
7637 scale = Address::times_1;
7638 stride = 16;
7639 } else {
7640 scale = Address::times_2;
7641 stride = 8;
7642 }
7643 } else {
7644 scale = Address::no_scale; // not used
7645 scale1 = Address::times_1;
7646 scale2 = Address::times_2;
7647 stride = 8;
7648 }
7649
7650 if (UseAVX >= 2 && UseSSE42Intrinsics) {
7651 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
7652 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
7653 Label COMPARE_TAIL_LONG;
7654 int pcmpmask = 0x19;
7655 if (ae == StrIntrinsicNode::LL) {
7656 pcmpmask &= ~0x01;
7657 }
7658
7659 // Setup to compare 16-chars (32-bytes) vectors,
7660 // start from first character again because it has aligned address.
7661 if (ae == StrIntrinsicNode::LL) {
7662 stride2 = 32;
7663 } else {
7664 stride2 = 16;
7665 }
7666 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7667 adr_stride = stride << scale;
7668 } else {
7669 adr_stride1 = 8; //stride << scale1;
7670 adr_stride2 = 16; //stride << scale2;
7766 movdqu(vec1, Address(str1, 0));
7767 } else {
7768 pmovzxbw(vec1, Address(str1, 0));
7769 }
7770 pcmpestri(vec1, Address(str2, 0), pcmpmask);
7771 jcc(Assembler::below, COMPARE_INDEX_CHAR);
7772 subptr(cnt2, stride);
7773 jccb(Assembler::zero, LENGTH_DIFF_LABEL);
7774 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7775 lea(str1, Address(str1, result, scale));
7776 lea(str2, Address(str2, result, scale));
7777 } else {
7778 lea(str1, Address(str1, result, scale1));
7779 lea(str2, Address(str2, result, scale2));
7780 }
7781 negptr(cnt2);
7782 jmpb(WHILE_HEAD_LABEL);
7783
7784 bind(COMPARE_SMALL_STR);
7785 } else if (UseSSE42Intrinsics) {
7786 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
7787 int pcmpmask = 0x19;
7788 // Setup to compare 8-char (16-byte) vectors,
7789 // start from first character again because it has aligned address.
7790 movl(result, cnt2);
7791 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count
7792 if (ae == StrIntrinsicNode::LL) {
7793 pcmpmask &= ~0x01;
7794 }
7795 jccb(Assembler::zero, COMPARE_TAIL);
7796 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7797 lea(str1, Address(str1, result, scale));
7798 lea(str2, Address(str2, result, scale));
7799 } else {
7800 lea(str1, Address(str1, result, scale1));
7801 lea(str2, Address(str2, result, scale2));
7802 }
7803 negptr(result);
7804
7805 // pcmpestri
7898 // Search for Non-ASCII character (Negative byte value) in a byte array,
7899 // return true if it has any and false otherwise.
7900 void MacroAssembler::has_negatives(Register ary1, Register len,
7901 Register result, Register tmp1,
7902 XMMRegister vec1, XMMRegister vec2) {
7903
7904 // rsi: byte array
7905 // rcx: len
7906 // rax: result
7907 ShortBranchVerifier sbv(this);
7908 assert_different_registers(ary1, len, result, tmp1);
7909 assert_different_registers(vec1, vec2);
7910 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
7911
7912 // len == 0
7913 testl(len, len);
7914 jcc(Assembler::zero, FALSE_LABEL);
7915
7916 movl(result, len); // copy
7917
7918 if (UseAVX >= 2) {
7919 // With AVX2, use 32-byte vector compare
7920 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
7921
7922 // Compare 32-byte vectors
7923 andl(result, 0x0000001f); // tail count (in bytes)
7924 andl(len, 0xffffffe0); // vector count (in bytes)
7925 jccb(Assembler::zero, COMPARE_TAIL);
7926
7927 lea(ary1, Address(ary1, len, Address::times_1));
7928 negptr(len);
7929
7930 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector
7931 movdl(vec2, tmp1);
7932 vpbroadcastd(vec2, vec2);
7933
7934 bind(COMPARE_WIDE_VECTORS);
7935 vmovdqu(vec1, Address(ary1, len, Address::times_1));
7936 vptest(vec1, vec2);
7937 jccb(Assembler::notZero, TRUE_LABEL);
7938 addptr(len, 32);
7939 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
7940
7941 testl(result, result);
7942 jccb(Assembler::zero, FALSE_LABEL);
7943
7944 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
7945 vptest(vec1, vec2);
7946 jccb(Assembler::notZero, TRUE_LABEL);
7947 jmpb(FALSE_LABEL);
7948
7949 bind(COMPARE_TAIL); // len is zero
7950 movl(len, result);
7951 // Fallthru to tail compare
7952 } else if (UseSSE42Intrinsics) {
7953 // With SSE4.2, use double quad vector compare
7954 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
7955
7956 // Compare 16-byte vectors
7957 andl(result, 0x0000000f); // tail count (in bytes)
7958 andl(len, 0xfffffff0); // vector count (in bytes)
7959 jccb(Assembler::zero, COMPARE_TAIL);
7960
7961 lea(ary1, Address(ary1, len, Address::times_1));
7962 negptr(len);
7963
7964 movl(tmp1, 0x80808080);
7965 movdl(vec2, tmp1);
7966 pshufd(vec2, vec2, 0);
7967
7968 bind(COMPARE_WIDE_VECTORS);
7969 movdqu(vec1, Address(ary1, len, Address::times_1));
7970 ptest(vec1, vec2);
7971 jccb(Assembler::notZero, TRUE_LABEL);
7972 addptr(len, 16);
8009 subptr(result, 2);
8010 lea(ary1, Address(ary1, 2));
8011
8012 bind(COMPARE_BYTE);
8013 testl(result, 0x1); // tail byte
8014 jccb(Assembler::zero, FALSE_LABEL);
8015 load_unsigned_byte(tmp1, Address(ary1, 0));
8016 andl(tmp1, 0x00000080);
8017 jccb(Assembler::notEqual, TRUE_LABEL);
8018 jmpb(FALSE_LABEL);
8019
8020 bind(TRUE_LABEL);
8021 movl(result, 1); // return true
8022 jmpb(DONE);
8023
8024 bind(FALSE_LABEL);
8025 xorl(result, result); // return false
8026
8027 // That's it
8028 bind(DONE);
8029 if (UseAVX >= 2) {
8030 // clean upper bits of YMM registers
8031 vpxor(vec1, vec1);
8032 vpxor(vec2, vec2);
8033 }
8034 }
8035
8036 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
8037 void MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
8038 Register limit, Register result, Register chr,
8039 XMMRegister vec1, XMMRegister vec2, bool is_char) {
8040 ShortBranchVerifier sbv(this);
8041 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
8042
8043 int length_offset = arrayOopDesc::length_offset_in_bytes();
8044 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
8045
8046 if (is_array_equ) {
8047 // Check the input args
8048 cmpptr(ary1, ary2);
8049 jcc(Assembler::equal, TRUE_LABEL);
8097 vptest(vec1, vec1);
8098 jccb(Assembler::notZero, FALSE_LABEL);
8099 addptr(limit, 32);
8100 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
8101
8102 testl(result, result);
8103 jccb(Assembler::zero, TRUE_LABEL);
8104
8105 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
8106 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
8107 vpxor(vec1, vec2);
8108
8109 vptest(vec1, vec1);
8110 jccb(Assembler::notZero, FALSE_LABEL);
8111 jmpb(TRUE_LABEL);
8112
8113 bind(COMPARE_TAIL); // limit is zero
8114 movl(limit, result);
8115 // Fallthru to tail compare
8116 } else if (UseSSE42Intrinsics) {
8117 // With SSE4.2, use double quad vector compare
8118 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
8119
8120 // Compare 16-byte vectors
8121 andl(result, 0x0000000f); // tail count (in bytes)
8122 andl(limit, 0xfffffff0); // vector count (in bytes)
8123 jccb(Assembler::zero, COMPARE_TAIL);
8124
8125 lea(ary1, Address(ary1, limit, Address::times_1));
8126 lea(ary2, Address(ary2, limit, Address::times_1));
8127 negptr(limit);
8128
8129 bind(COMPARE_WIDE_VECTORS);
8130 movdqu(vec1, Address(ary1, limit, Address::times_1));
8131 movdqu(vec2, Address(ary2, limit, Address::times_1));
8132 pxor(vec1, vec2);
8133
8134 ptest(vec1, vec1);
8135 jccb(Assembler::notZero, FALSE_LABEL);
8136 addptr(limit, 16);
8446 // rdx: len
8447 // rcx: tmp5
8448 // rax: result
8449 ShortBranchVerifier sbv(this);
8450 assert_different_registers(src, dst, len, tmp5, result);
8451 Label L_done, L_copy_1_char, L_copy_1_char_exit;
8452
8453 // set result
8454 xorl(result, result);
8455 // check for zero length
8456 testl(len, len);
8457 jcc(Assembler::zero, L_done);
8458 movl(result, len);
8459
8460 // Setup pointers
8461 lea(src, Address(src, len, Address::times_2)); // char[]
8462 lea(dst, Address(dst, len, Address::times_1)); // byte[]
8463 negptr(len);
8464
8465 if (UseSSE42Intrinsics || UseAVX >= 2) {
8466 Label L_chars_8_check, L_copy_8_chars, L_copy_8_chars_exit;
8467 Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
8468
8469 if (UseAVX >= 2) {
8470 Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
8471 movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vector
8472 movdl(tmp1Reg, tmp5);
8473 vpbroadcastd(tmp1Reg, tmp1Reg);
8474 jmpb(L_chars_32_check);
8475
8476 bind(L_copy_32_chars);
8477 vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
8478 vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
8479 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
8480 vptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector
8481 jccb(Assembler::notZero, L_copy_32_chars_exit);
8482 vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
8483 vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
8484 vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
8485
10218 XMMRegister tmp3Reg, XMMRegister tmp4Reg,
10219 Register tmp5, Register result) {
10220 Label copy_chars_loop, return_length, return_zero, done;
10221
10222 // rsi: src
10223 // rdi: dst
10224 // rdx: len
10225 // rcx: tmp5
10226 // rax: result
10227
10228 // rsi holds start addr of source char[] to be compressed
10229 // rdi holds start addr of destination byte[]
10230 // rdx holds length
10231
10232 assert(len != result, "");
10233
10234 // save length for return
10235 push(len);
10236
10237 if (UseSSE42Intrinsics) {
10238 Label copy_32_loop, copy_16, copy_tail;
10239
10240 movl(result, len);
10241 movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vectors
10242
10243 // vectored compression
10244 andl(len, 0xfffffff0); // vector count (in chars)
10245 andl(result, 0x0000000f); // tail count (in chars)
10246 testl(len, len);
10247 jccb(Assembler::zero, copy_16);
10248
10249 // compress 16 chars per iter
10250 movdl(tmp1Reg, tmp5);
10251 pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg
10252 pxor(tmp4Reg, tmp4Reg);
10253
10254 lea(src, Address(src, len, Address::times_2));
10255 lea(dst, Address(dst, len, Address::times_1));
10256 negptr(len);
10257
10317
10318 bind(done);
10319 }
10320
10321 // Inflate byte[] array to char[].
10322 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
10323 XMMRegister tmp1, Register tmp2) {
10324 Label copy_chars_loop, done;
10325
10326 // rsi: src
10327 // rdi: dst
10328 // rdx: len
10329 // rcx: tmp2
10330
10331 // rsi holds start addr of source byte[] to be inflated
10332 // rdi holds start addr of destination char[]
10333 // rdx holds length
10334 assert_different_registers(src, dst, len, tmp2);
10335
10336 if (UseSSE42Intrinsics) {
10337 Label copy_8_loop, copy_bytes, copy_tail;
10338
10339 movl(tmp2, len);
10340 andl(tmp2, 0x00000007); // tail count (in chars)
10341 andl(len, 0xfffffff8); // vector count (in chars)
10342 jccb(Assembler::zero, copy_tail);
10343
10344 // vectored inflation
10345 lea(src, Address(src, len, Address::times_1));
10346 lea(dst, Address(dst, len, Address::times_2));
10347 negptr(len);
10348
10349 // inflate 8 chars per iter
10350 bind(copy_8_loop);
10351 pmovzxbw(tmp1, Address(src, len, Address::times_1)); // unpack to 8 words
10352 movdqu(Address(dst, len, Address::times_2), tmp1);
10353 addptr(len, 8);
10354 jcc(Assembler::notZero, copy_8_loop);
10355
10356 bind(copy_tail);
|
6958 xorptr(tmp, tmp);
6959 if (UseFastStosb) {
6960 shlptr(cnt,3); // convert to number of bytes
6961 rep_stosb();
6962 } else {
6963 NOT_LP64(shlptr(cnt,1);) // convert to number of dwords for 32-bit VM
6964 rep_stos();
6965 }
6966 }
6967
6968 #ifdef COMPILER2
6969
6970 // IndexOf for constant substrings with size >= 8 chars
6971 // which don't need to be loaded through stack.
6972 void MacroAssembler::string_indexofC8(Register str1, Register str2,
6973 Register cnt1, Register cnt2,
6974 int int_cnt2, Register result,
6975 XMMRegister vec, Register tmp,
6976 int ae) {
6977 ShortBranchVerifier sbv(this);
6978 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
6979 assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
6980 assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
6981
6982 // This method uses the pcmpestri instruction with bound registers
6983 // inputs:
6984 // xmm - substring
6985 // rax - substring length (elements count)
6986 // mem - scanned string
6987 // rdx - string length (elements count)
6988 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
6989 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
6990 // outputs:
6991 // rcx - matched index in string
6992 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6993 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
6994 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
6995 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
6996 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
6997
6998 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
6999 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
7137 } // (int_cnt2 > 8)
7138
7139 bind(RET_FOUND);
7140 // Found result if we matched full small substring.
7141 // Compute substr offset
7142 subptr(result, str1);
7143 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
7144 shrl(result, 1); // index
7145 }
7146 bind(EXIT);
7147
7148 } // string_indexofC8
7149
7150 // Small strings are loaded through stack if they cross page boundary.
7151 void MacroAssembler::string_indexof(Register str1, Register str2,
7152 Register cnt1, Register cnt2,
7153 int int_cnt2, Register result,
7154 XMMRegister vec, Register tmp,
7155 int ae) {
7156 ShortBranchVerifier sbv(this);
7157 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
7158 assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
7159 assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
7160
7161 //
7162 // int_cnt2 is length of small (< 8 chars) constant substring
7163 // or (-1) for non constant substring in which case its length
7164 // is in cnt2 register.
7165 //
7166 // Note, inline_string_indexOf() generates checks:
7167 // if (substr.count > string.count) return -1;
7168 // if (substr.count == 0) return 0;
7169 //
7170 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
7171 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
7172 // This method uses the pcmpestri instruction with bound registers
7173 // inputs:
7174 // xmm - substring
7175 // rax - substring length (elements count)
7176 // mem - scanned string
7177 // rdx - string length (elements count)
7178 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
7455 jmpb(SCAN_SUBSTR);
7456
7457 bind(RET_FOUND_LONG);
7458 movptr(str1, Address(rsp, wordSize));
7459 } // non constant
7460
7461 bind(RET_FOUND);
7462 // Compute substr offset
7463 subptr(result, str1);
7464 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
7465 shrl(result, 1); // index
7466 }
7467 bind(CLEANUP);
7468 pop(rsp); // restore SP
7469
7470 } // string_indexof
7471
7472 void MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
7473 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
// Emit code that scans the array of 16-bit chars at 'str1' ('cnt1' elements)
// for the 16-bit value in 'ch'.  On exit 'result' holds the element index of
// the first occurrence, or -1 if the char is absent.  'cnt1' and 'tmp' are
// clobbered as scratch; 'ch' is overwritten on the vector-match path (bsfl
// below); vec1..vec3 are scratch XMM registers.  Because SSE4.2 is asserted,
// the 16-byte vector loop is emitted unconditionally; AVX2 adds a 32-byte loop.
7474 ShortBranchVerifier sbv(this);
7475 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
7476 assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
7477
// 8 chars (16 bytes) are handled per SSE iteration; the AVX2 loop does 2*stride.
7478 int stride = 8;
7479
7480 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
7481 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
7482 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
7483 FOUND_SEQ_CHAR, DONE_LABEL;
7484
// 'result' doubles as the running scan pointer; it is converted back to an
// element index at FOUND_SEQ_CHAR.
7485 movptr(result, str1);
7486 if (UseAVX >= 2) {
// Fewer than 8 chars: go straight to the scalar loop.
7487 cmpl(cnt1, stride);
7488 jccb(Assembler::less, SCAN_TO_CHAR_LOOP);
// Fewer than 16 chars: skip the 32-byte loop, set up the 16-byte SSE loop.
7489 cmpl(cnt1, 2*stride);
7490 jccb(Assembler::less, SCAN_TO_8_CHAR_INIT);
// Broadcast 'ch' into all 16 word lanes of vec1; keep vec2 all-zero for the
// vptest trick used in the loop below.
7491 movdl(vec1, ch);
7492 vpbroadcastw(vec1, vec1);
7493 vpxor(vec2, vec2);
7494 movl(tmp, cnt1);
7495 andl(tmp, 0xFFFFFFF0); //vector count (in chars)
7496 andl(cnt1,0x0000000F); //tail count (in chars)
7497
// Main AVX2 loop: compare 16 chars (32 bytes) per iteration.
7498 bind(SCAN_TO_16_CHAR_LOOP);
7499 vmovdqu(vec3, Address(result, 0));
// Word lanes of vec3 become all-ones where they equal 'ch'.
7500 vpcmpeqw(vec3, vec3, vec1, 1);
// VPTEST: CF = ((vec3 & ~vec2) == 0); with vec2 == 0, CF is set iff vec3 is
// all-zero, so carryClear <=> at least one lane matched.
7501 vptest(vec2, vec3);
7502 jcc(Assembler::carryClear, FOUND_CHAR);
7503 addptr(result, 32);
7504 subl(tmp, 2*stride);
7505 jccb(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
7506 jmp(SCAN_TO_8_CHAR);
// Entered when 8 <= cnt1 < 16: replicate 'ch' into all 8 word lanes of vec1
// for the SSE loop (pshuflw spreads the low word, pshufd the low dword).
7507 bind(SCAN_TO_8_CHAR_INIT);
7508 movdl(vec1, ch);
7509 pshuflw(vec1, vec1, 0x00);
7510 pshufd(vec1, vec1, 0);
7511 pxor(vec2, vec2);
7512 }
7513 bind(SCAN_TO_8_CHAR);
7514 cmpl(cnt1, stride);
7515 if (UseAVX >= 2) {
// AVX2 path already split off the tail count; short tails go to the scalar scan.
7516 jccb(Assembler::less, SCAN_TO_CHAR);
7517 } else {
// Pure SSE path: vec1/vec2 were never initialized, so short inputs must go
// directly to the scalar loop, and longer ones broadcast 'ch' here.
7518 jccb(Assembler::less, SCAN_TO_CHAR_LOOP);
7519 movdl(vec1, ch);
7520 pshuflw(vec1, vec1, 0x00);
7521 pshufd(vec1, vec1, 0);
7522 pxor(vec2, vec2);
7523 }
7524 movl(tmp, cnt1);
7525 andl(tmp, 0xFFFFFFF8); //vector count (in chars)
7526 andl(cnt1,0x00000007); //tail count (in chars)
7527
// SSE loop: compare 8 chars (16 bytes) per iteration; same zero-vec2 PTEST
// trick as above (carryClear <=> some lane matched).
7528 bind(SCAN_TO_8_CHAR_LOOP);
7529 movdqu(vec3, Address(result, 0));
7530 pcmpeqw(vec3, vec1);
7531 ptest(vec2, vec3);
7532 jcc(Assembler::carryClear, FOUND_CHAR);
7533 addptr(result, 16);
7534 subl(tmp, stride);
7535 jccb(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
// Scalar tail: cnt1 holds the remaining (< stride) char count.
7536 bind(SCAN_TO_CHAR);
7537 testl(cnt1, cnt1);
7538 jcc(Assembler::zero, RET_NOT_FOUND);
// One char per iteration.
7539 bind(SCAN_TO_CHAR_LOOP);
7540 load_unsigned_short(tmp, Address(result, 0));
7541 cmpl(ch, tmp);
7542 jccb(Assembler::equal, FOUND_SEQ_CHAR);
7543 addptr(result, 2);
7544 subl(cnt1, 1);
7545 jccb(Assembler::zero, RET_NOT_FOUND);
7546 jmp(SCAN_TO_CHAR_LOOP);
7547
7548 bind(RET_NOT_FOUND);
7549 movl(result, -1);
7550 jmpb(DONE_LABEL);
7551
// Vector hit: build a byte mask of the matching lanes, then bsf finds the
// lowest set bit, i.e. the byte offset of the first match inside the vector.
7552 bind(FOUND_CHAR);
7553 if (UseAVX >= 2) {
7554 vpmovmskb(tmp, vec3);
7555 } else {
7556 pmovmskb(tmp, vec3);
7557 }
7558 bsfl(ch, tmp);
7559 addl(result, ch);
7560
// Convert the scan pointer back to an element index: (result - str1) / 2.
7561 bind(FOUND_SEQ_CHAR);
7562 subptr(result, str1);
7563 shrl(result, 1);
7564
7565 bind(DONE_LABEL);
7566 } // string_indexof_char
7567
7568 // helper function for string_compare
7569 void MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
7570 Address::ScaleFactor scale, Address::ScaleFactor scale1,
7571 Address::ScaleFactor scale2, Register index, int ae) {
7572 if (ae == StrIntrinsicNode::LL) {
7573 load_unsigned_byte(elem1, Address(str1, index, scale, 0));
7574 load_unsigned_byte(elem2, Address(str2, index, scale, 0));
7575 } else if (ae == StrIntrinsicNode::UU) {
7576 load_unsigned_short(elem1, Address(str1, index, scale, 0));
7577 load_unsigned_short(elem2, Address(str2, index, scale, 0));
7578 } else {
7579 load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
7628
7629 // Check if the strings start at the same location and setup scale and stride
7630 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7631 cmpptr(str1, str2);
7632 jcc(Assembler::equal, LENGTH_DIFF_LABEL);
7633 if (ae == StrIntrinsicNode::LL) {
7634 scale = Address::times_1;
7635 stride = 16;
7636 } else {
7637 scale = Address::times_2;
7638 stride = 8;
7639 }
7640 } else {
7641 scale = Address::no_scale; // not used
7642 scale1 = Address::times_1;
7643 scale2 = Address::times_2;
7644 stride = 8;
7645 }
7646
7647 if (UseAVX >= 2 && UseSSE42Intrinsics) {
7648 assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
7649 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
7650 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
7651 Label COMPARE_TAIL_LONG;
7652 int pcmpmask = 0x19;
7653 if (ae == StrIntrinsicNode::LL) {
7654 pcmpmask &= ~0x01;
7655 }
7656
7657 // Setup to compare 16-chars (32-bytes) vectors,
7658 // start from first character again because it has aligned address.
7659 if (ae == StrIntrinsicNode::LL) {
7660 stride2 = 32;
7661 } else {
7662 stride2 = 16;
7663 }
7664 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7665 adr_stride = stride << scale;
7666 } else {
7667 adr_stride1 = 8; //stride << scale1;
7668 adr_stride2 = 16; //stride << scale2;
7764 movdqu(vec1, Address(str1, 0));
7765 } else {
7766 pmovzxbw(vec1, Address(str1, 0));
7767 }
7768 pcmpestri(vec1, Address(str2, 0), pcmpmask);
7769 jcc(Assembler::below, COMPARE_INDEX_CHAR);
7770 subptr(cnt2, stride);
7771 jccb(Assembler::zero, LENGTH_DIFF_LABEL);
7772 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7773 lea(str1, Address(str1, result, scale));
7774 lea(str2, Address(str2, result, scale));
7775 } else {
7776 lea(str1, Address(str1, result, scale1));
7777 lea(str2, Address(str2, result, scale2));
7778 }
7779 negptr(cnt2);
7780 jmpb(WHILE_HEAD_LABEL);
7781
7782 bind(COMPARE_SMALL_STR);
7783 } else if (UseSSE42Intrinsics) {
7784 assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
7785 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
7786 int pcmpmask = 0x19;
7787 // Setup to compare 8-char (16-byte) vectors,
7788 // start from first character again because it has aligned address.
7789 movl(result, cnt2);
7790 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count
7791 if (ae == StrIntrinsicNode::LL) {
7792 pcmpmask &= ~0x01;
7793 }
7794 jccb(Assembler::zero, COMPARE_TAIL);
7795 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7796 lea(str1, Address(str1, result, scale));
7797 lea(str2, Address(str2, result, scale));
7798 } else {
7799 lea(str1, Address(str1, result, scale1));
7800 lea(str2, Address(str2, result, scale2));
7801 }
7802 negptr(result);
7803
7804 // pcmpestri
7897 // Search for Non-ASCII character (Negative byte value) in a byte array,
7898 // return true if it has any and false otherwise.
7899 void MacroAssembler::has_negatives(Register ary1, Register len,
7900 Register result, Register tmp1,
7901 XMMRegister vec1, XMMRegister vec2) {
7902
7903 // rsi: byte array
7904 // rcx: len
7905 // rax: result
7906 ShortBranchVerifier sbv(this);
7907 assert_different_registers(ary1, len, result, tmp1);
7908 assert_different_registers(vec1, vec2);
7909 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
7910
7911 // len == 0
7912 testl(len, len);
7913 jcc(Assembler::zero, FALSE_LABEL);
7914
7915 movl(result, len); // copy
7916
7917 if (UseAVX >= 2 && UseSSE42Intrinsics) {
7918 assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
7919 // With AVX2, use 32-byte vector compare
7920 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
7921
7922 // Compare 32-byte vectors
7923 andl(result, 0x0000001f); // tail count (in bytes)
7924 andl(len, 0xffffffe0); // vector count (in bytes)
7925 jccb(Assembler::zero, COMPARE_TAIL);
7926
7927 lea(ary1, Address(ary1, len, Address::times_1));
7928 negptr(len);
7929
7930 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector
7931 movdl(vec2, tmp1);
7932 vpbroadcastd(vec2, vec2);
7933
7934 bind(COMPARE_WIDE_VECTORS);
7935 vmovdqu(vec1, Address(ary1, len, Address::times_1));
7936 vptest(vec1, vec2);
7937 jccb(Assembler::notZero, TRUE_LABEL);
7938 addptr(len, 32);
7939 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
7940
7941 testl(result, result);
7942 jccb(Assembler::zero, FALSE_LABEL);
7943
7944 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
7945 vptest(vec1, vec2);
7946 jccb(Assembler::notZero, TRUE_LABEL);
7947 jmpb(FALSE_LABEL);
7948
7949 bind(COMPARE_TAIL); // len is zero
7950 movl(len, result);
7951 // Fallthru to tail compare
7952 } else if (UseSSE42Intrinsics) {
7953 assert(UseSSE >= 4, "SSE4 must be for SSE4.2 intrinsics to be available");
7954 // With SSE4.2, use double quad vector compare
7955 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
7956
7957 // Compare 16-byte vectors
7958 andl(result, 0x0000000f); // tail count (in bytes)
7959 andl(len, 0xfffffff0); // vector count (in bytes)
7960 jccb(Assembler::zero, COMPARE_TAIL);
7961
7962 lea(ary1, Address(ary1, len, Address::times_1));
7963 negptr(len);
7964
7965 movl(tmp1, 0x80808080);
7966 movdl(vec2, tmp1);
7967 pshufd(vec2, vec2, 0);
7968
7969 bind(COMPARE_WIDE_VECTORS);
7970 movdqu(vec1, Address(ary1, len, Address::times_1));
7971 ptest(vec1, vec2);
7972 jccb(Assembler::notZero, TRUE_LABEL);
7973 addptr(len, 16);
8010 subptr(result, 2);
8011 lea(ary1, Address(ary1, 2));
8012
8013 bind(COMPARE_BYTE);
8014 testl(result, 0x1); // tail byte
8015 jccb(Assembler::zero, FALSE_LABEL);
8016 load_unsigned_byte(tmp1, Address(ary1, 0));
8017 andl(tmp1, 0x00000080);
8018 jccb(Assembler::notEqual, TRUE_LABEL);
8019 jmpb(FALSE_LABEL);
8020
8021 bind(TRUE_LABEL);
8022 movl(result, 1); // return true
8023 jmpb(DONE);
8024
8025 bind(FALSE_LABEL);
8026 xorl(result, result); // return false
8027
8028 // That's it
8029 bind(DONE);
8030 if (UseAVX >= 2 && UseSSE42Intrinsics) {
8031 // clean upper bits of YMM registers
8032 vpxor(vec1, vec1);
8033 vpxor(vec2, vec2);
8034 }
8035 }
8036
8037 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
8038 void MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
8039 Register limit, Register result, Register chr,
8040 XMMRegister vec1, XMMRegister vec2, bool is_char) {
8041 ShortBranchVerifier sbv(this);
8042 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
8043
8044 int length_offset = arrayOopDesc::length_offset_in_bytes();
8045 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
8046
8047 if (is_array_equ) {
8048 // Check the input args
8049 cmpptr(ary1, ary2);
8050 jcc(Assembler::equal, TRUE_LABEL);
8098 vptest(vec1, vec1);
8099 jccb(Assembler::notZero, FALSE_LABEL);
8100 addptr(limit, 32);
8101 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
8102
8103 testl(result, result);
8104 jccb(Assembler::zero, TRUE_LABEL);
8105
8106 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
8107 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
8108 vpxor(vec1, vec2);
8109
8110 vptest(vec1, vec1);
8111 jccb(Assembler::notZero, FALSE_LABEL);
8112 jmpb(TRUE_LABEL);
8113
8114 bind(COMPARE_TAIL); // limit is zero
8115 movl(limit, result);
8116 // Fallthru to tail compare
8117 } else if (UseSSE42Intrinsics) {
8118 assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
8119 // With SSE4.2, use double quad vector compare
8120 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
8121
8122 // Compare 16-byte vectors
8123 andl(result, 0x0000000f); // tail count (in bytes)
8124 andl(limit, 0xfffffff0); // vector count (in bytes)
8125 jccb(Assembler::zero, COMPARE_TAIL);
8126
8127 lea(ary1, Address(ary1, limit, Address::times_1));
8128 lea(ary2, Address(ary2, limit, Address::times_1));
8129 negptr(limit);
8130
8131 bind(COMPARE_WIDE_VECTORS);
8132 movdqu(vec1, Address(ary1, limit, Address::times_1));
8133 movdqu(vec2, Address(ary2, limit, Address::times_1));
8134 pxor(vec1, vec2);
8135
8136 ptest(vec1, vec1);
8137 jccb(Assembler::notZero, FALSE_LABEL);
8138 addptr(limit, 16);
8448 // rdx: len
8449 // rcx: tmp5
8450 // rax: result
8451 ShortBranchVerifier sbv(this);
8452 assert_different_registers(src, dst, len, tmp5, result);
8453 Label L_done, L_copy_1_char, L_copy_1_char_exit;
8454
8455 // set result
8456 xorl(result, result);
8457 // check for zero length
8458 testl(len, len);
8459 jcc(Assembler::zero, L_done);
8460 movl(result, len);
8461
8462 // Setup pointers
8463 lea(src, Address(src, len, Address::times_2)); // char[]
8464 lea(dst, Address(dst, len, Address::times_1)); // byte[]
8465 negptr(len);
8466
8467 if (UseSSE42Intrinsics || UseAVX >= 2) {
8468 assert(UseSSE42Intrinsics ? UseSSE >= 4 : true, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
8469 Label L_chars_8_check, L_copy_8_chars, L_copy_8_chars_exit;
8470 Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
8471
8472 if (UseAVX >= 2) {
8473 Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
8474 movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vector
8475 movdl(tmp1Reg, tmp5);
8476 vpbroadcastd(tmp1Reg, tmp1Reg);
8477 jmpb(L_chars_32_check);
8478
8479 bind(L_copy_32_chars);
8480 vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
8481 vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
8482 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
8483 vptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector
8484 jccb(Assembler::notZero, L_copy_32_chars_exit);
8485 vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
8486 vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
8487 vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
8488
10221 XMMRegister tmp3Reg, XMMRegister tmp4Reg,
10222 Register tmp5, Register result) {
10223 Label copy_chars_loop, return_length, return_zero, done;
10224
10225 // rsi: src
10226 // rdi: dst
10227 // rdx: len
10228 // rcx: tmp5
10229 // rax: result
10230
10231 // rsi holds start addr of source char[] to be compressed
10232 // rdi holds start addr of destination byte[]
10233 // rdx holds length
10234
10235 assert(len != result, "");
10236
10237 // save length for return
10238 push(len);
10239
10240 if (UseSSE42Intrinsics) {
10241 assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
10242 Label copy_32_loop, copy_16, copy_tail;
10243
10244 movl(result, len);
10245 movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vectors
10246
10247 // vectored compression
10248 andl(len, 0xfffffff0); // vector count (in chars)
10249 andl(result, 0x0000000f); // tail count (in chars)
10250 testl(len, len);
10251 jccb(Assembler::zero, copy_16);
10252
10253 // compress 16 chars per iter
10254 movdl(tmp1Reg, tmp5);
10255 pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg
10256 pxor(tmp4Reg, tmp4Reg);
10257
10258 lea(src, Address(src, len, Address::times_2));
10259 lea(dst, Address(dst, len, Address::times_1));
10260 negptr(len);
10261
10321
10322 bind(done);
10323 }
10324
10325 // Inflate byte[] array to char[].
10326 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
10327 XMMRegister tmp1, Register tmp2) {
10328 Label copy_chars_loop, done;
10329
10330 // rsi: src
10331 // rdi: dst
10332 // rdx: len
10333 // rcx: tmp2
10334
10335 // rsi holds start addr of source byte[] to be inflated
10336 // rdi holds start addr of destination char[]
10337 // rdx holds length
10338 assert_different_registers(src, dst, len, tmp2);
10339
10340 if (UseSSE42Intrinsics) {
10341 assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
10342 Label copy_8_loop, copy_bytes, copy_tail;
10343
10344 movl(tmp2, len);
10345 andl(tmp2, 0x00000007); // tail count (in chars)
10346 andl(len, 0xfffffff8); // vector count (in chars)
10347 jccb(Assembler::zero, copy_tail);
10348
10349 // vectored inflation
10350 lea(src, Address(src, len, Address::times_1));
10351 lea(dst, Address(dst, len, Address::times_2));
10352 negptr(len);
10353
10354 // inflate 8 chars per iter
10355 bind(copy_8_loop);
10356 pmovzxbw(tmp1, Address(src, len, Address::times_1)); // unpack to 8 words
10357 movdqu(Address(dst, len, Address::times_2), tmp1);
10358 addptr(len, 8);
10359 jcc(Assembler::notZero, copy_8_loop);
10360
10361 bind(copy_tail);
|