8238 }
8239 subl(result, cnt1);
8240 jmpb(POP_LABEL);
8241 }//if (VM_Version::supports_avx512vlbw())
8242 #endif // _LP64
8243
8244 // Discard the stored length difference
8245 bind(POP_LABEL);
8246 pop(cnt1);
8247
8248 // That's it
8249 bind(DONE_LABEL);
8250 if (ae == StrIntrinsicNode::UL) {
8251 negl(result);
8252 }
8253
8254 }
8255
8256 // Search for a non-ASCII character (negative byte value) in a byte array;
8257 // return true if one is found, false otherwise.
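// Roughly equivalent scalar logic (illustrative sketch only, not the emitted code):
//   for (int i = 0; i < len; i++) {
//     if (ary1[i] < 0) return true;   // byte with the sign bit set
//   }
//   return false;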
8258 void MacroAssembler::has_negatives(Register ary1, Register len,
8259 Register result, Register tmp1,
8260 XMMRegister vec1, XMMRegister vec2) {
8261
8262 // rsi: byte array
8263 // rcx: len
8264 // rax: result
8265 ShortBranchVerifier sbv(this);
8266 assert_different_registers(ary1, len, result, tmp1);
8267 assert_different_registers(vec1, vec2);
8268 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
8269
8270 // len == 0
8271 testl(len, len);
8272 jcc(Assembler::zero, FALSE_LABEL);
8273
8274 movl(result, len); // copy
8275
8276 if (UseAVX >= 2 && UseSSE >= 2) {
8277 // With AVX2, use 32-byte vector compare
8278 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
8279
8280 // Compare 32-byte vectors
8281 andl(result, 0x0000001f); // tail count (in bytes)
8282 andl(len, 0xffffffe0); // vector count (in bytes)
8283 jccb(Assembler::zero, COMPARE_TAIL);
8284
8285 lea(ary1, Address(ary1, len, Address::times_1));
8286 negptr(len);
8287
8288 movl(tmp1, 0x80808080); // create mask to test for negative bytes in vector
8289 movdl(vec2, tmp1);
8290 vpbroadcastd(vec2, vec2);
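// vec2 now holds 0x80 in every byte lane; vptest below ANDs each loaded
// vector with it, so ZF stays set only while no byte has its sign bit set
// (i.e. no negative byte has been seen in that 32-byte chunk).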
8291
8292 bind(COMPARE_WIDE_VECTORS);
8293 vmovdqu(vec1, Address(ary1, len, Address::times_1));
8294 vptest(vec1, vec2);
8295 jccb(Assembler::notZero, TRUE_LABEL);
8296 addptr(len, 32);
8326
8327 bind(COMPARE_WIDE_VECTORS);
8328 movdqu(vec1, Address(ary1, len, Address::times_1));
8329 ptest(vec1, vec2);
8330 jccb(Assembler::notZero, TRUE_LABEL);
8331 addptr(len, 16);
8332 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
8333
8334 testl(result, result);
8335 jccb(Assembler::zero, FALSE_LABEL);
8336
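// Re-check the last 16 bytes of the array with an overlapping load; this
// covers the tail (< 16 bytes) without a scalar loop, at the cost of
// testing some already-checked bytes again.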
8337 movdqu(vec1, Address(ary1, result, Address::times_1, -16));
8338 ptest(vec1, vec2);
8339 jccb(Assembler::notZero, TRUE_LABEL);
8340 jmpb(FALSE_LABEL);
8341
8342 bind(COMPARE_TAIL); // len is zero
8343 movl(len, result);
8344 // Fallthru to tail compare
8345 }
8346
8347 // Compare 4-byte vectors
8348 andl(len, 0xfffffffc); // vector count (in bytes)
8349 jccb(Assembler::zero, COMPARE_CHAR);
8350
8351 lea(ary1, Address(ary1, len, Address::times_1));
8352 negptr(len);
8353
8354 bind(COMPARE_VECTORS);
8355 movl(tmp1, Address(ary1, len, Address::times_1));
8356 andl(tmp1, 0x80808080);
8357 jccb(Assembler::notZero, TRUE_LABEL);
8358 addptr(len, 4);
8359 jcc(Assembler::notZero, COMPARE_VECTORS);
8360
8361 // Compare trailing char (final 2 bytes), if any
8362 bind(COMPARE_CHAR);
8363 testl(result, 0x2); // tail char
8364 jccb(Assembler::zero, COMPARE_BYTE);
8365 load_unsigned_short(tmp1, Address(ary1, 0));
8366 andl(tmp1, 0x00008080);
8820 testl(count, 1<<(shift-1));
8821 jccb(Assembler::zero, L_fill_byte);
8822 movw(Address(to, 0), value);
8823 if (t == T_BYTE) {
8824 addptr(to, 2);
8825 BIND(L_fill_byte);
8826 // fill trailing byte
8827 testl(count, 1);
8828 jccb(Assembler::zero, L_exit);
8829 movb(Address(to, 0), value);
8830 } else {
8831 BIND(L_fill_byte);
8832 }
8833 } else {
8834 BIND(L_fill_2_bytes);
8835 }
8836 BIND(L_exit);
8837 }
8838
8839 // encode char[] to byte[] in ISO_8859_1
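// Behavior sketch (illustrative only): copy chars to bytes until a char
// above 0xFF is hit, and return the number of chars actually encoded:
//   int i = 0;
//   for (; i < len && src[i] <= 0xFF; i++) dst[i] = (byte) src[i];
//   return i;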
8840 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
8841 XMMRegister tmp1Reg, XMMRegister tmp2Reg,
8842 XMMRegister tmp3Reg, XMMRegister tmp4Reg,
8843 Register tmp5, Register result) {
8844 // rsi: src
8845 // rdi: dst
8846 // rdx: len
8847 // rcx: tmp5
8848 // rax: result
8849 ShortBranchVerifier sbv(this);
8850 assert_different_registers(src, dst, len, tmp5, result);
8851 Label L_done, L_copy_1_char, L_copy_1_char_exit;
8852
8853 // set result
8854 xorl(result, result);
8855 // check for zero length
8856 testl(len, len);
8857 jcc(Assembler::zero, L_done);
8858 movl(result, len);
8859
8860 // Setup pointers
8861 lea(src, Address(src, len, Address::times_2)); // char[]
8862 lea(dst, Address(dst, len, Address::times_1)); // byte[]
8863 negptr(len);
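// Loop idiom used throughout this file: src/dst are advanced to the end of
// the data and len is negated, so Address(base, len, scale) walks forward
// while a single add on len both advances the index and, on reaching zero,
// terminates the loop.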
8864
8865 if (UseSSE42Intrinsics || UseAVX >= 2) {
8866 assert(UseSSE42Intrinsics ? UseSSE >= 4 : true, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
8867 Label L_chars_8_check, L_copy_8_chars, L_copy_8_chars_exit;
8868 Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
8869
8870 if (UseAVX >= 2) {
8871 Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
8872 movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vector
8873 movdl(tmp1Reg, tmp5);
8874 vpbroadcastd(tmp1Reg, tmp1Reg);
8875 jmpb(L_chars_32_check);
8876
8877 bind(L_copy_32_chars);
8946 packuswb(tmp3Reg, tmp1Reg);
8947 movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
8948 addptr(len, 8);
8949 jccb(Assembler::lessEqual, L_copy_8_chars);
8950
8951 bind(L_copy_8_chars_exit);
8952 subptr(len, 8);
8953 jccb(Assembler::zero, L_done);
8954 }
8955
8956 bind(L_copy_1_char);
8957 load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
8958 testl(tmp5, 0xff00); // check if Unicode char
8959 jccb(Assembler::notZero, L_copy_1_char_exit);
8960 movb(Address(dst, len, Address::times_1, 0), tmp5);
8961 addptr(len, 1);
8962 jccb(Assembler::less, L_copy_1_char);
8963
8964 bind(L_copy_1_char_exit);
8965 addptr(result, len); // len is negative count of not processed elements
8966 bind(L_done);
8967 }
8968
8969 #ifdef _LP64
8970 /**
8971 * Helper for multiply_to_len().
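 * Computes dest_hi:dest_lo += src1 + src2 as a 128-bit accumulation
 * (each 64-bit add propagates its carry into dest_hi).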
8972 */
8973 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
8974 addq(dest_lo, src1);
8975 adcq(dest_hi, 0);
8976 addq(dest_lo, src2);
8977 adcq(dest_hi, 0);
8978 }
8979
8980 /**
8981 * Multiply 64-bit by 64-bit, first loop.
8982 */
8983 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
8984 Register y, Register y_idx, Register z,
8985 Register carry, Register product,
10766
10767 BIND(L_byteByByteProlog);
10768 andl(in2, 0x00000007);
10769 movl(tmp2, 1);
10770
10771 BIND(L_byteByByte);
10772 cmpl(tmp2, in2);
10773 jccb(Assembler::greater, L_exit);
10774 movb(tmp1, Address(in1, 0));
10775 crc32(in_out, tmp1, 1);
10776 incl(in1);
10777 incl(tmp2);
10778 jmp(L_byteByByte);
10779
10780 BIND(L_exit);
10781 }
10782 #endif // _LP64
10783 #undef BIND
10784 #undef BLOCK_COMMENT
10785
10786
10787 // Compress char[] array to byte[].
10788 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
10789 XMMRegister tmp1Reg, XMMRegister tmp2Reg,
10790 XMMRegister tmp3Reg, XMMRegister tmp4Reg,
10791 Register tmp5, Register result) {
10792 Label copy_chars_loop, return_length, return_zero, done;
10793
10794 // rsi: src
10795 // rdi: dst
10796 // rdx: len
10797 // rcx: tmp5
10798 // rax: result
10799
10800 // rsi holds start addr of source char[] to be compressed
10801 // rdi holds start addr of destination byte[]
10802 // rdx holds length
10803
10804 assert(len != result, "");
10805
10806 // save length for return
10807 push(len);
10808
10809 if (UseSSE42Intrinsics) {
10810 assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
10811 Label copy_32_loop, copy_16, copy_tail;
10812
10813 movl(result, len);
10814 movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vectors
10815
10816 // vectored compression
10817 andl(len, 0xfffffff0); // vector count (in chars)
10818 andl(result, 0x0000000f); // tail count (in chars)
10819 testl(len, len);
10820 jccb(Assembler::zero, copy_16);
10821
10822 // compress 16 chars per iter
10823 movdl(tmp1Reg, tmp5);
10824 pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg
10825 pxor(tmp4Reg, tmp4Reg);
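// tmp4Reg starts at zero and accumulates (ORs together) every vector of
// chars processed, so a char above 0xFF anywhere in the block can be
// detected later.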
10826
10827 lea(src, Address(src, len, Address::times_2));
10828 lea(dst, Address(dst, len, Address::times_1));
10829 negptr(len);
10830
10831 bind(copy_32_loop);
10832 movdqu(tmp2Reg, Address(src, len, Address::times_2)); // load 1st 8 characters
10833 por(tmp4Reg, tmp2Reg);
10875 testl(result, 0xff00); // check if Unicode char
10876 jccb(Assembler::notZero, return_zero);
10877 movb(Address(dst, len, Address::times_1), result); // Latin-1 char; compress to 1 byte
10878 increment(len);
10879 jcc(Assembler::notZero, copy_chars_loop);
10880
10881 // if compression succeeded, return length
10882 bind(return_length);
10883 pop(result);
10884 jmpb(done);
10885
10886 // if compression failed, return 0
10887 bind(return_zero);
10888 xorl(result, result);
10889 addptr(rsp, wordSize);
10890
10891 bind(done);
10892 }
10893
10894 // Inflate byte[] array to char[].
10895 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
10896 XMMRegister tmp1, Register tmp2) {
10897 Label copy_chars_loop, done;
10898
10899 // rsi: src
10900 // rdi: dst
10901 // rdx: len
10902 // rcx: tmp2
10903
10904 // rsi holds start addr of source byte[] to be inflated
10905 // rdi holds start addr of destination char[]
10906 // rdx holds length
10907 assert_different_registers(src, dst, len, tmp2);
10908
10909 if (UseSSE42Intrinsics) {
10910 assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
10911 Label copy_8_loop, copy_bytes, copy_tail;
10912
10913 movl(tmp2, len);
10914 andl(tmp2, 0x00000007); // tail count (in chars)
10915 andl(len, 0xfffffff8); // vector count (in chars)
10916 jccb(Assembler::zero, copy_tail);
10917
10918 // vectored inflation
10919 lea(src, Address(src, len, Address::times_1));
10920 lea(dst, Address(dst, len, Address::times_2));
10921 negptr(len);
10922
10923 // inflate 8 chars per iter
10924 bind(copy_8_loop);
10925 pmovzxbw(tmp1, Address(src, len, Address::times_1)); // unpack to 8 words
10926 movdqu(Address(dst, len, Address::times_2), tmp1);
10927 addptr(len, 8);
10928 jcc(Assembler::notZero, copy_8_loop);
10929
10930 bind(copy_tail);
10931 movl(len, tmp2);
10932
10933 cmpl(len, 4);
10934 jccb(Assembler::less, copy_bytes);
10935
10936 movdl(tmp1, Address(src, 0)); // load 4 byte chars
10937 pmovzxbw(tmp1, tmp1);
10938 movq(Address(dst, 0), tmp1);
10939 subptr(len, 4);
10940 addptr(src, 4);
10941 addptr(dst, 8);
10942
10943 bind(copy_bytes);
10944 }
10945 testl(len, len);
10946 jccb(Assembler::zero, done);
10947 lea(src, Address(src, len, Address::times_1));
10948 lea(dst, Address(dst, len, Address::times_2));
10949 negptr(len);
10950
10951 // inflate 1 char per iter
10952 bind(copy_chars_loop);
10953 load_unsigned_byte(tmp2, Address(src, len, Address::times_1)); // load byte char
10954 movw(Address(dst, len, Address::times_2), tmp2); // inflate byte char to word
10955 increment(len);
10956 jcc(Assembler::notZero, copy_chars_loop);
10957
10958 bind(done);
10959 }
10960
10961
10962 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
10963 switch (cond) {
10964 // Note some conditions are synonyms for others
10965 case Assembler::zero: return Assembler::notZero;
10966 case Assembler::notZero: return Assembler::zero;
10967 case Assembler::less: return Assembler::greaterEqual;
10968 case Assembler::lessEqual: return Assembler::greater;
10969 case Assembler::greater: return Assembler::lessEqual;
10970 case Assembler::greaterEqual: return Assembler::less;
10971 case Assembler::below: return Assembler::aboveEqual;
10972 case Assembler::belowEqual: return Assembler::above;
10973 case Assembler::above: return Assembler::belowEqual;
10974 case Assembler::aboveEqual: return Assembler::below;
10975 case Assembler::overflow: return Assembler::noOverflow;
10976 case Assembler::noOverflow: return Assembler::overflow;
10977 case Assembler::negative: return Assembler::positive;
10978 case Assembler::positive: return Assembler::negative;
10979 case Assembler::parity: return Assembler::noParity;
10980 case Assembler::noParity: return Assembler::parity;
8238 }
8239 subl(result, cnt1);
8240 jmpb(POP_LABEL);
8241 }//if (VM_Version::supports_avx512vlbw())
8242 #endif // _LP64
8243
8244 // Discard the stored length difference
8245 bind(POP_LABEL);
8246 pop(cnt1);
8247
8248 // That's it
8249 bind(DONE_LABEL);
8250 if (ae == StrIntrinsicNode::UL) {
8251 negl(result);
8252 }
8253
8254 }
8255
8256 // Search for a non-ASCII character (negative byte value) in a byte array;
8257 // return true if one is found, false otherwise.
8258 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
8259 // @HotSpotIntrinsicCandidate
8260 // private static boolean hasNegatives(byte[] ba, int off, int len) {
8261 // for (int i = off; i < off + len; i++) {
8262 // if (ba[i] < 0) {
8263 // return true;
8264 // }
8265 // }
8266 // return false;
8267 // }
8268 void MacroAssembler::has_negatives(Register ary1, Register len,
8269 Register result, Register tmp1,
8270 XMMRegister vec1, XMMRegister vec2) {
8271 // rsi: byte array
8272 // rcx: len
8273 // rax: result
8274 ShortBranchVerifier sbv(this);
8275 assert_different_registers(ary1, len, result, tmp1);
8276 assert_different_registers(vec1, vec2);
8277 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
8278
8279 // len == 0
8280 testl(len, len);
8281 jcc(Assembler::zero, FALSE_LABEL);
8282
8283 if ((UseAVX > 2) && // AVX512
8284 VM_Version::supports_avx512vlbw() &&
8285 VM_Version::supports_bmi2()) {
8286
8287 set_vector_masking(); // opening of the stub context for programming mask registers
8288
8289 Label test_64_loop, test_tail;
8290 Register tmp3_aliased = len;
8291
8292 movl(tmp1, len);
8293 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
8294
8295 andl(tmp1, 64 - 1); // tail count (in chars) 0x3F
8296 andl(len, ~(64 - 1)); // vector count (in chars)
8297 jccb(Assembler::zero, test_tail);
8298
8299 lea(ary1, Address(ary1, len, Address::times_1));
8300 negptr(len);
8301
8302 bind(test_64_loop);
8303 // Check whether any of these 64 byte elements is negative
8304 evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
8305 kortestql(k2, k2);
8306 jcc(Assembler::notZero, TRUE_LABEL);
8307
8308 addptr(len, 64);
8309 jccb(Assembler::notZero, test_64_loop);
8310
8311
8312 bind(test_tail);
8313 // bail out when there is nothing to be done
8314 testl(tmp1, -1);
8315 jcc(Assembler::zero, FALSE_LABEL);
8316
8317 // Save k1
8318 kmovql(k3, k1);
8319
8320 // ~(~0 << len) applied up to two times (for 32-bit scenario)
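// Worked example (assuming tmp1 = 3 tail bytes): ~0 << 3 = ...11111000,
// and ~(...11111000) = 0b111, i.e. a mask selecting exactly the 3 lowest
// lanes for the masked tail compare below.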
8321 #ifdef _LP64
8322 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
8323 shlxq(tmp3_aliased, tmp3_aliased, tmp1);
8324 notq(tmp3_aliased);
8325 kmovql(k1, tmp3_aliased);
8326 #else
8327 Label k_init;
8328 jmp(k_init);
8329
8330 // We cannot read 64 bits from a general purpose register here, so the data
8331 // required to compose the 64-bit mask is emitted into the instruction stream.
8332 // We emit a 64-byte-wide series of the values 0..63, which is later used as
8333 // the compare target against the tail count held in the tmp1 register.
8334 // The result is a k1 register holding tmp1 consecutive 1's, counting from
8335 // the least significant bit.
8336 address tmp = pc();
8337 emit_int64(0x0706050403020100);
8338 emit_int64(0x0F0E0D0C0B0A0908);
8339 emit_int64(0x1716151413121110);
8340 emit_int64(0x1F1E1D1C1B1A1918);
8341 emit_int64(0x2726252423222120);
8342 emit_int64(0x2F2E2D2C2B2A2928);
8343 emit_int64(0x3736353433323130);
8344 emit_int64(0x3F3E3D3C3B3A3938);
8345
8346 bind(k_init);
8347 lea(len, InternalAddress(tmp));
8348 // broadcast the tail count and compare it against the 0..63 table to build the tail mask in k1
8349 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
8350 evpcmpgtb(k1, vec1, Address(len, 0), Assembler::AVX_512bit);
8351
8352 #endif
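// Masked compare of the tail: k2 gets a bit for every masked byte that is
// negative, i.e. where 0 > ary1[i]; ktestq then checks those bits under k1.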
8353 evpcmpgtb(k2, k1, vec2, Address(ary1, 0), Assembler::AVX_512bit);
8354 ktestq(k2, k1);
8355 // Restore k1
8356 kmovql(k1, k3);
8357 jcc(Assembler::notZero, TRUE_LABEL);
8358
8359 jmp(FALSE_LABEL);
8360
8361 clear_vector_masking(); // closing of the stub context for programming mask registers
8362 } else {
8363 movl(result, len); // copy
8364
8365 if (UseAVX == 2 && UseSSE >= 2) {
8366 // With AVX2, use 32-byte vector compare
8367 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
8368
8369 // Compare 32-byte vectors
8370 andl(result, 0x0000001f); // tail count (in bytes)
8371 andl(len, 0xffffffe0); // vector count (in bytes)
8372 jccb(Assembler::zero, COMPARE_TAIL);
8373
8374 lea(ary1, Address(ary1, len, Address::times_1));
8375 negptr(len);
8376
8377 movl(tmp1, 0x80808080); // create mask to test for negative bytes in vector
8378 movdl(vec2, tmp1);
8379 vpbroadcastd(vec2, vec2);
8380
8381 bind(COMPARE_WIDE_VECTORS);
8382 vmovdqu(vec1, Address(ary1, len, Address::times_1));
8383 vptest(vec1, vec2);
8384 jccb(Assembler::notZero, TRUE_LABEL);
8385 addptr(len, 32);
8415
8416 bind(COMPARE_WIDE_VECTORS);
8417 movdqu(vec1, Address(ary1, len, Address::times_1));
8418 ptest(vec1, vec2);
8419 jccb(Assembler::notZero, TRUE_LABEL);
8420 addptr(len, 16);
8421 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
8422
8423 testl(result, result);
8424 jccb(Assembler::zero, FALSE_LABEL);
8425
8426 movdqu(vec1, Address(ary1, result, Address::times_1, -16));
8427 ptest(vec1, vec2);
8428 jccb(Assembler::notZero, TRUE_LABEL);
8429 jmpb(FALSE_LABEL);
8430
8431 bind(COMPARE_TAIL); // len is zero
8432 movl(len, result);
8433 // Fallthru to tail compare
8434 }
8435 }
8436 // Compare 4-byte vectors
8437 andl(len, 0xfffffffc); // vector count (in bytes)
8438 jccb(Assembler::zero, COMPARE_CHAR);
8439
8440 lea(ary1, Address(ary1, len, Address::times_1));
8441 negptr(len);
8442
8443 bind(COMPARE_VECTORS);
8444 movl(tmp1, Address(ary1, len, Address::times_1));
8445 andl(tmp1, 0x80808080);
8446 jccb(Assembler::notZero, TRUE_LABEL);
8447 addptr(len, 4);
8448 jcc(Assembler::notZero, COMPARE_VECTORS);
8449
8450 // Compare trailing char (final 2 bytes), if any
8451 bind(COMPARE_CHAR);
8452 testl(result, 0x2); // tail char
8453 jccb(Assembler::zero, COMPARE_BYTE);
8454 load_unsigned_short(tmp1, Address(ary1, 0));
8455 andl(tmp1, 0x00008080);
8909 testl(count, 1<<(shift-1));
8910 jccb(Assembler::zero, L_fill_byte);
8911 movw(Address(to, 0), value);
8912 if (t == T_BYTE) {
8913 addptr(to, 2);
8914 BIND(L_fill_byte);
8915 // fill trailing byte
8916 testl(count, 1);
8917 jccb(Assembler::zero, L_exit);
8918 movb(Address(to, 0), value);
8919 } else {
8920 BIND(L_fill_byte);
8921 }
8922 } else {
8923 BIND(L_fill_2_bytes);
8924 }
8925 BIND(L_exit);
8926 }
8927
8928 // encode char[] to byte[] in ISO_8859_1
8929 // @HotSpotIntrinsicCandidate
8930 // private static int implEncodeISOArray(byte[] sa, int sp,
8931 //                                        byte[] da, int dp, int len) {
8932 // int i = 0;
8933 // for (; i < len; i++) {
8934 // char c = StringUTF16.getChar(sa, sp++);
8935 // if (c > '\u00FF')
8936 // break;
8937 // da[dp++] = (byte)c;
8938 // }
8939 // return i;
8940 //}
8941 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
8942 XMMRegister tmp1Reg, XMMRegister tmp2Reg,
8943 XMMRegister tmp3Reg, XMMRegister tmp4Reg,
8944 Register tmp5, Register result) {
8945
8946 // rsi: src
8947 // rdi: dst
8948 // rdx: len
8949 // rcx: tmp5
8950 // rax: result
8951 ShortBranchVerifier sbv(this);
8952 assert_different_registers(src, dst, len, tmp5, result);
8953 Label L_done, L_copy_1_char, L_copy_1_char_exit;
8954
8955 // set result
8956 xorl(result, result);
8957 // check for zero length
8958 testl(len, len);
8959 jcc(Assembler::zero, L_done);
8960
8961 movl(result, len);
8962
8963 // Setup pointers
8964 lea(src, Address(src, len, Address::times_2)); // char[]
8965 lea(dst, Address(dst, len, Address::times_1)); // byte[]
8966 negptr(len);
8967
8968 if (UseSSE42Intrinsics || UseAVX >= 2) {
8969 assert(UseSSE42Intrinsics ? UseSSE >= 4 : true, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
8970 Label L_chars_8_check, L_copy_8_chars, L_copy_8_chars_exit;
8971 Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
8972
8973 if (UseAVX >= 2) {
8974 Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
8975 movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vector
8976 movdl(tmp1Reg, tmp5);
8977 vpbroadcastd(tmp1Reg, tmp1Reg);
8978 jmpb(L_chars_32_check);
8979
8980 bind(L_copy_32_chars);
9049 packuswb(tmp3Reg, tmp1Reg);
9050 movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
9051 addptr(len, 8);
9052 jccb(Assembler::lessEqual, L_copy_8_chars);
9053
9054 bind(L_copy_8_chars_exit);
9055 subptr(len, 8);
9056 jccb(Assembler::zero, L_done);
9057 }
9058
9059 bind(L_copy_1_char);
9060 load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
9061 testl(tmp5, 0xff00); // check if Unicode char
9062 jccb(Assembler::notZero, L_copy_1_char_exit);
9063 movb(Address(dst, len, Address::times_1, 0), tmp5);
9064 addptr(len, 1);
9065 jccb(Assembler::less, L_copy_1_char);
9066
9067 bind(L_copy_1_char_exit);
9068 addptr(result, len); // len is negative count of not processed elements
9069
9070 bind(L_done);
9071 }
9072
9073 #ifdef _LP64
9074 /**
9075 * Helper for multiply_to_len().
9076 */
9077 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
9078 addq(dest_lo, src1);
9079 adcq(dest_hi, 0);
9080 addq(dest_lo, src2);
9081 adcq(dest_hi, 0);
9082 }
9083
9084 /**
9085 * Multiply 64-bit by 64-bit, first loop.
9086 */
9087 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
9088 Register y, Register y_idx, Register z,
9089 Register carry, Register product,
10870
10871 BIND(L_byteByByteProlog);
10872 andl(in2, 0x00000007);
10873 movl(tmp2, 1);
10874
10875 BIND(L_byteByByte);
10876 cmpl(tmp2, in2);
10877 jccb(Assembler::greater, L_exit);
10878 movb(tmp1, Address(in1, 0));
10879 crc32(in_out, tmp1, 1);
10880 incl(in1);
10881 incl(tmp2);
10882 jmp(L_byteByByte);
10883
10884 BIND(L_exit);
10885 }
10886 #endif // _LP64
10887 #undef BIND
10888 #undef BLOCK_COMMENT
10889
10890 // Compress char[] array to byte[].
10891 // ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
10892 // @HotSpotIntrinsicCandidate
10893 // private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
10894 // for (int i = 0; i < len; i++) {
10895 // int c = src[srcOff++];
10896 // if (c >>> 8 != 0) {
10897 // return 0;
10898 // }
10899 // dst[dstOff++] = (byte)c;
10900 // }
10901 // return len;
10902 // }
10903 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
10904 XMMRegister tmp1Reg, XMMRegister tmp2Reg,
10905 XMMRegister tmp3Reg, XMMRegister tmp4Reg,
10906 Register tmp5, Register result) {
10907 Label copy_chars_loop, return_length, return_zero, done, below_threshold;
10908
10909 // rsi: src
10910 // rdi: dst
10911 // rdx: len
10912 // rcx: tmp5
10913 // rax: result
10914
10915 // rsi holds start addr of source char[] to be compressed
10916 // rdi holds start addr of destination byte[]
10917 // rdx holds length
10918
10919 assert(len != result, "");
10920
10921 // save length for return
10922 push(len);
10923
10924 if ((UseAVX > 2) && // AVX512
10925 VM_Version::supports_avx512vlbw() &&
10926 VM_Version::supports_bmi2()) {
10927
10928 set_vector_masking(); // opening of the stub context for programming mask registers
10929
10930 Label copy_32_loop, copy_loop_tail, copy_just_portion_of_candidates;
10931
10932 // alignment
10933 Label post_alignment;
10934
10935 // if the length of the string is less than 32, handle it in an
10936 // old-fashioned way
10937 testl(len, -32);
10938 jcc(Assembler::zero, below_threshold);
10939
10940 // First check whether a character is compressible (<= 0xFF).
10941 // Create mask to test for Unicode chars inside zmm vector
10942 movl(result, 0x00FF);
10943 evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
10944
10945 testl(len, -64);
10946 jcc(Assembler::zero, post_alignment);
10947
10948 // Save k1
10949 kmovql(k3, k1);
10950
10951 movl(tmp5, dst);
10952 andl(tmp5, (64 - 1));
10953 negl(tmp5);
10954 andl(tmp5, (64 - 1));
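// tmp5 = (-dst) & 63: the number of bytes (and, since dst is a byte[],
// elements) needed to reach the next 64-byte boundary of the destination.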
10955
10956 // bail out when there is nothing to be done
10957 testl(tmp5, 0xFFFFFFFF);
10958 jcc(Assembler::zero, post_alignment);
10959
10960 // ~(~0 << tmp5), where tmp5 is the # of elements to process for alignment
10961 movl(result, 0xFFFFFFFF);
10962 shlxl(result, result, tmp5);
10963 notl(result);
10964
10965 kmovdl(k1, result);
10966
10967 evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit);
10968 evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, ComparisonPredicate::le, Assembler::AVX_512bit);
10969 ktestd(k2, k1);
10970 jcc(Assembler::carryClear, copy_just_portion_of_candidates);
10971
10972 evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
10973
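// Advance past the alignment prefix: src moves by 2 * tmp5 bytes (chars are
// 2 bytes wide, hence the two additions), dst by tmp5 bytes, and tmp5 chars
// are subtracted from the remaining length.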
10974 addptr(src, tmp5);
10975 addptr(src, tmp5);
10976 addptr(dst, tmp5);
10977 subl(len, tmp5);
10978
10979 bind(post_alignment);
10980 // end of alignment
10981
10982 movl(tmp5, len);
10983 andl(tmp5, (32 - 1)); // tail count (in chars)
10984 andl(len, ~(32 - 1)); // vector count (in chars)
10985 jcc(Assembler::zero, copy_loop_tail);
10986
10987 lea(src, Address(src, len, Address::times_2));
10988 lea(dst, Address(dst, len, Address::times_1));
10989 negptr(len);
10990
10991 bind(copy_32_loop);
10992 evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
10993 evpcmpuw(k2, tmp1Reg, tmp2Reg, ComparisonPredicate::le, Assembler::AVX_512bit);
10994 kortestdl(k2, k2);
10995 jcc(Assembler::carryClear, copy_just_portion_of_candidates);
10996
10997 // All elements in the current chunk are valid candidates for compression.
10998 // Write the truncated byte elements to memory.
10999 evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
11000 addptr(len, 32);
11001 jcc(Assembler::notZero, copy_32_loop);
11002
11003 bind(copy_loop_tail);
11004 // bail out when there is nothing to be done
11005 testl(tmp5, 0xFFFFFFFF);
11006 jcc(Assembler::zero, return_length);
11007
11008 // Save k1
11009 kmovql(k3, k1);
11010
11011 movl(len, tmp5);
11012
11013 // ~(~0 << len), where len is the # of remaining elements to process
11014 movl(result, 0xFFFFFFFF);
11015 shlxl(result, result, len);
11016 notl(result);
11017
11018 kmovdl(k1, result);
11019
11020 evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit);
11021 evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, ComparisonPredicate::le, Assembler::AVX_512bit);
11022 ktestd(k2, k1);
11023 jcc(Assembler::carryClear, copy_just_portion_of_candidates);
11024
11025 evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
11026 // Restore k1
11027 kmovql(k1, k3);
11028
11029 jmp(return_length);
11030
11031 bind(copy_just_portion_of_candidates);
11032 kmovdl(tmp5, k2);
11033 tzcntl(tmp5, tmp5);
11034
11035 // ~(~0 << tmp5), where tmp5 is the number of elements, counted from the start
11036 // of the chunk, up to the first element larger than 0xFF
11037 movl(result, 0xFFFFFFFF);
11038 shlxl(result, result, tmp5);
11039 notl(result);
11040
11041 kmovdl(k1, result);
11042
11043 evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
11044 // Restore k1
11045 kmovql(k1, k3);
11046
11047 jmp(return_zero);
11048
11049 clear_vector_masking(); // closing of the stub context for programming mask registers
11050 }
11051 if (UseSSE42Intrinsics) {
11052 assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
11053 Label copy_32_loop, copy_16, copy_tail;
11054
11055 bind(below_threshold);
11056
11057 movl(result, len);
11058
11059 movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vectors
11060
11061 // vectored compression
11062 andl(len, 0xfffffff0); // vector count (in chars)
11063 andl(result, 0x0000000f); // tail count (in chars)
11064 testl(len, len);
11065 jccb(Assembler::zero, copy_16);
11066
11067 // compress 16 chars per iter
11068 movdl(tmp1Reg, tmp5);
11069 pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg
11070 pxor(tmp4Reg, tmp4Reg);
11071
11072 lea(src, Address(src, len, Address::times_2));
11073 lea(dst, Address(dst, len, Address::times_1));
11074 negptr(len);
11075
11076 bind(copy_32_loop);
11077 movdqu(tmp2Reg, Address(src, len, Address::times_2)); // load 1st 8 characters
11078 por(tmp4Reg, tmp2Reg);
11120 testl(result, 0xff00); // check if Unicode char
11121 jccb(Assembler::notZero, return_zero);
11122 movb(Address(dst, len, Address::times_1), result); // Latin-1 char; compress to 1 byte
11123 increment(len);
11124 jcc(Assembler::notZero, copy_chars_loop);
11125
11126 // if compression succeeded, return length
11127 bind(return_length);
11128 pop(result);
11129 jmpb(done);
11130
11131 // if compression failed, return 0
11132 bind(return_zero);
11133 xorl(result, result);
11134 addptr(rsp, wordSize);
11135
11136 bind(done);
11137 }
11138
11139 // Inflate byte[] array to char[].
11140 // ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
11141 // @HotSpotIntrinsicCandidate
11142 // private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
11143 // for (int i = 0; i < len; i++) {
11144 // dst[dstOff++] = (char)(src[srcOff++] & 0xff);
11145 // }
11146 // }
11147 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
11148 XMMRegister tmp1, Register tmp2) {
11149 Label copy_chars_loop, done, below_threshold;
11150 // rsi: src
11151 // rdi: dst
11152 // rdx: len
11153 // rcx: tmp2
11154
11155 // rsi holds start addr of source byte[] to be inflated
11156 // rdi holds start addr of destination char[]
11157 // rdx holds length
11158 assert_different_registers(src, dst, len, tmp2);
11159
11160 if ((UseAVX > 2) && // AVX512
11161 VM_Version::supports_avx512vlbw() &&
11162 VM_Version::supports_bmi2()) {
11163
11164 set_vector_masking(); // opening of the stub context for programming mask registers
11165
11166 Label copy_32_loop, copy_tail;
11167 Register tmp3_aliased = len;
11168
11169 // if the length of the string is less than 16, handle it in an
11170 // old-fashioned way
11171 testl(len, -16);
11172 jcc(Assembler::zero, below_threshold);
11173
11174 // In order to use only one arithmetic operation for the main loop we use
11175 // this pre-calculation
11176 movl(tmp2, len);
11177 andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
11178 andl(len, -32); // vector count
11179 jccb(Assembler::zero, copy_tail);
11180
11181 lea(src, Address(src, len, Address::times_1));
11182 lea(dst, Address(dst, len, Address::times_2));
11183 negptr(len);
11184
11185
11186 // inflate 32 chars per iter
11187 bind(copy_32_loop);
11188 vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
11189 evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
11190 addptr(len, 32);
11191 jcc(Assembler::notZero, copy_32_loop);
11192
11193 bind(copy_tail);
11194 // bail out when there is nothing to be done
11195 testl(tmp2, -1); // we don't destroy the contents of tmp2 here
11196 jcc(Assembler::zero, done);
11197
11198 // Save k1
11199 kmovql(k2, k1);
11200
11201 // ~(~0 << length), where length is the # of remaining elements to process
11202 movl(tmp3_aliased, -1);
11203 shlxl(tmp3_aliased, tmp3_aliased, tmp2);
11204 notl(tmp3_aliased);
11205 kmovdl(k1, tmp3_aliased);
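// k1 now has tmp2 low bits set, selecting the tail elements: zero-extend
// those bytes to chars under the mask, then store the widened chars, again
// under k1.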
11206 evpmovzxbw(tmp1, k1, Address(src, 0), Assembler::AVX_512bit);
11207 evmovdquw(Address(dst, 0), k1, tmp1, Assembler::AVX_512bit);
11208
11209 // Restore k1
11210 kmovql(k1, k2);
11211 jmp(done);
11212
11213 clear_vector_masking(); // closing of the stub context for programming mask registers
11214 }
11215 if (UseSSE42Intrinsics) {
11216 assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
11217 Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
11218
11219 movl(tmp2, len);
11220
11221 if (UseAVX > 1) {
11222 andl(tmp2, (16 - 1));
11223 andl(len, -16);
11224 jccb(Assembler::zero, copy_new_tail);
11225 } else {
11226 andl(tmp2, 0x00000007); // tail count (in chars)
11227 andl(len, 0xfffffff8); // vector count (in chars)
11228 jccb(Assembler::zero, copy_tail);
11229 }
11230
11231 // vectored inflation
11232 lea(src, Address(src, len, Address::times_1));
11233 lea(dst, Address(dst, len, Address::times_2));
11234 negptr(len);
11235
11236 if (UseAVX > 1) {
11237 bind(copy_16_loop);
11238 vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
11239 vmovdqu(Address(dst, len, Address::times_2), tmp1);
11240 addptr(len, 16);
11241 jcc(Assembler::notZero, copy_16_loop);
11242
11243 bind(below_threshold);
11244 bind(copy_new_tail);
11245 if (UseAVX > 2) {
11246 movl(tmp2, len);
11247 }
11248 else {
11249 movl(len, tmp2);
11250 }
11251 andl(tmp2, 0x00000007);
11252 andl(len, 0xFFFFFFF8);
11253 jccb(Assembler::zero, copy_tail);
11254
11255 pmovzxbw(tmp1, Address(src, 0));
11256 movdqu(Address(dst, 0), tmp1);
11257 addptr(src, 8);
11258 addptr(dst, 2 * 8);
11259
11260 jmp(copy_tail, true);
11261 }
11262
11263 // inflate 8 chars per iter
11264 bind(copy_8_loop);
11265 pmovzxbw(tmp1, Address(src, len, Address::times_1)); // unpack to 8 words
11266 movdqu(Address(dst, len, Address::times_2), tmp1);
11267 addptr(len, 8);
11268 jcc(Assembler::notZero, copy_8_loop);
11269
11270 bind(copy_tail);
11271 movl(len, tmp2);
11272
11273 cmpl(len, 4);
11274 jccb(Assembler::less, copy_bytes);
11275
11276 movdl(tmp1, Address(src, 0)); // load 4 byte chars
11277 pmovzxbw(tmp1, tmp1);
11278 movq(Address(dst, 0), tmp1);
11279 subptr(len, 4);
11280 addptr(src, 4);
11281 addptr(dst, 8);
11282
11283 bind(copy_bytes);
11284 }
11285 testl(len, len);
11286 jccb(Assembler::zero, done);
11287 lea(src, Address(src, len, Address::times_1));
11288 lea(dst, Address(dst, len, Address::times_2));
11289 negptr(len);
11290
11291 // inflate 1 char per iter
11292 bind(copy_chars_loop);
11293 load_unsigned_byte(tmp2, Address(src, len, Address::times_1)); // load byte char
11294 movw(Address(dst, len, Address::times_2), tmp2); // inflate byte char to word
11295 increment(len);
11296 jcc(Assembler::notZero, copy_chars_loop);
11297
11298 bind(done);
11299 }
11300
11301 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
11302 switch (cond) {
11303 // Note some conditions are synonyms for others
11304 case Assembler::zero: return Assembler::notZero;
11305 case Assembler::notZero: return Assembler::zero;
11306 case Assembler::less: return Assembler::greaterEqual;
11307 case Assembler::lessEqual: return Assembler::greater;
11308 case Assembler::greater: return Assembler::lessEqual;
11309 case Assembler::greaterEqual: return Assembler::less;
11310 case Assembler::below: return Assembler::aboveEqual;
11311 case Assembler::belowEqual: return Assembler::above;
11312 case Assembler::above: return Assembler::belowEqual;
11313 case Assembler::aboveEqual: return Assembler::below;
11314 case Assembler::overflow: return Assembler::noOverflow;
11315 case Assembler::noOverflow: return Assembler::overflow;
11316 case Assembler::negative: return Assembler::positive;
11317 case Assembler::positive: return Assembler::negative;
11318 case Assembler::parity: return Assembler::noParity;
11319 case Assembler::noParity: return Assembler::parity;
|