< prev index next >

src/cpu/x86/vm/macroAssembler_x86.cpp

Print this page




8238     }
8239     subl(result, cnt1);
8240     jmpb(POP_LABEL);
8241   }//if (VM_Version::supports_avx512vlbw())
8242 #endif // _LP64
8243 
8244   // Discard the stored length difference
8245   bind(POP_LABEL);
8246   pop(cnt1);
8247 
8248   // That's it
8249   bind(DONE_LABEL);
8250   if(ae == StrIntrinsicNode::UL) {
8251     negl(result);
8252   }
8253 
8254 }
8255 
8256 // Search for Non-ASCII character (Negative byte value) in a byte array,
8257 // return true if it has any and false otherwise.










8258 void MacroAssembler::has_negatives(Register ary1, Register len,
8259                                    Register result, Register tmp1,
8260                                    XMMRegister vec1, XMMRegister vec2) {
8261 
8262   // rsi: byte array
8263   // rcx: len
8264   // rax: result
8265   ShortBranchVerifier sbv(this);
8266   assert_different_registers(ary1, len, result, tmp1);
8267   assert_different_registers(vec1, vec2);
8268   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
8269 
8270   // len == 0
8271   testl(len, len);
8272   jcc(Assembler::zero, FALSE_LABEL);
8273 
















































































8274   movl(result, len); // copy
8275 
8276   if (UseAVX >= 2 && UseSSE >= 2) {
8277     // With AVX2, use 32-byte vector compare
8278     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
8279 
8280     // Compare 32-byte vectors
8281     andl(result, 0x0000001f);  //   tail count (in bytes)
8282     andl(len, 0xffffffe0);   // vector count (in bytes)
8283     jccb(Assembler::zero, COMPARE_TAIL);
8284 
8285     lea(ary1, Address(ary1, len, Address::times_1));
8286     negptr(len);
8287 
8288     movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
8289     movdl(vec2, tmp1);
8290     vpbroadcastd(vec2, vec2);
8291 
8292     bind(COMPARE_WIDE_VECTORS);
8293     vmovdqu(vec1, Address(ary1, len, Address::times_1));
8294     vptest(vec1, vec2);
8295     jccb(Assembler::notZero, TRUE_LABEL);
8296     addptr(len, 32);


8326 
8327     bind(COMPARE_WIDE_VECTORS);
8328     movdqu(vec1, Address(ary1, len, Address::times_1));
8329     ptest(vec1, vec2);
8330     jccb(Assembler::notZero, TRUE_LABEL);
8331     addptr(len, 16);
8332     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
8333 
8334     testl(result, result);
8335     jccb(Assembler::zero, FALSE_LABEL);
8336 
8337     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
8338     ptest(vec1, vec2);
8339     jccb(Assembler::notZero, TRUE_LABEL);
8340     jmpb(FALSE_LABEL);
8341 
8342     bind(COMPARE_TAIL); // len is zero
8343     movl(len, result);
8344     // Fallthru to tail compare
8345   }
8346 
8347   // Compare 4-byte vectors
8348   andl(len, 0xfffffffc); // vector count (in bytes)
8349   jccb(Assembler::zero, COMPARE_CHAR);
8350 
8351   lea(ary1, Address(ary1, len, Address::times_1));
8352   negptr(len);
8353 
8354   bind(COMPARE_VECTORS);
8355   movl(tmp1, Address(ary1, len, Address::times_1));
8356   andl(tmp1, 0x80808080);
8357   jccb(Assembler::notZero, TRUE_LABEL);
8358   addptr(len, 4);
8359   jcc(Assembler::notZero, COMPARE_VECTORS);
8360 
8361   // Compare trailing char (final 2 bytes), if any
8362   bind(COMPARE_CHAR);
8363   testl(result, 0x2);   // tail  char
8364   jccb(Assembler::zero, COMPARE_BYTE);
8365   load_unsigned_short(tmp1, Address(ary1, 0));
8366   andl(tmp1, 0x00008080);


8820     testl(count, 1<<(shift-1));
8821     jccb(Assembler::zero, L_fill_byte);
8822     movw(Address(to, 0), value);
8823     if (t == T_BYTE) {
8824       addptr(to, 2);
8825       BIND(L_fill_byte);
8826       // fill trailing byte
8827       testl(count, 1);
8828       jccb(Assembler::zero, L_exit);
8829       movb(Address(to, 0), value);
8830     } else {
8831       BIND(L_fill_byte);
8832     }
8833   } else {
8834     BIND(L_fill_2_bytes);
8835   }
8836   BIND(L_exit);
8837 }
8838 
8839 // encode char[] to byte[] in ISO_8859_1












8840 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
8841                                       XMMRegister tmp1Reg, XMMRegister tmp2Reg,
8842                                       XMMRegister tmp3Reg, XMMRegister tmp4Reg,
8843                                       Register tmp5, Register result) {

8844   // rsi: src
8845   // rdi: dst
8846   // rdx: len
8847   // rcx: tmp5
8848   // rax: result
8849   ShortBranchVerifier sbv(this);
8850   assert_different_registers(src, dst, len, tmp5, result);
8851   Label L_done, L_copy_1_char, L_copy_1_char_exit;
8852 
8853   // set result
8854   xorl(result, result);
8855   // check for zero length
8856   testl(len, len);
8857   jcc(Assembler::zero, L_done);

8858   movl(result, len);
8859 
8860   // Setup pointers
8861   lea(src, Address(src, len, Address::times_2)); // char[]
8862   lea(dst, Address(dst, len, Address::times_1)); // byte[]
8863   negptr(len);
8864 
8865   if (UseSSE42Intrinsics || UseAVX >= 2) {
8866     assert(UseSSE42Intrinsics ? UseSSE >= 4 : true, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
8867     Label L_chars_8_check, L_copy_8_chars, L_copy_8_chars_exit;
8868     Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
8869 
8870     if (UseAVX >= 2) {
8871       Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
8872       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
8873       movdl(tmp1Reg, tmp5);
8874       vpbroadcastd(tmp1Reg, tmp1Reg);
8875       jmpb(L_chars_32_check);
8876 
8877       bind(L_copy_32_chars);


8946     packuswb(tmp3Reg, tmp1Reg);
8947     movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
8948     addptr(len, 8);
8949     jccb(Assembler::lessEqual, L_copy_8_chars);
8950 
8951     bind(L_copy_8_chars_exit);
8952     subptr(len, 8);
8953     jccb(Assembler::zero, L_done);
8954   }
8955 
8956   bind(L_copy_1_char);
8957   load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
8958   testl(tmp5, 0xff00);      // check if Unicode char
8959   jccb(Assembler::notZero, L_copy_1_char_exit);
8960   movb(Address(dst, len, Address::times_1, 0), tmp5);
8961   addptr(len, 1);
8962   jccb(Assembler::less, L_copy_1_char);
8963 
8964   bind(L_copy_1_char_exit);
8965   addptr(result, len); // len is negative count of not processed elements

8966   bind(L_done);
8967 }
8968 
8969 #ifdef _LP64
8970 /**
8971  * Helper for multiply_to_len().
8972  */
8973 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
// Emits code computing the 128-bit accumulation  dest_hi:dest_lo += src1 + src2.
// Each 64-bit addq may set the carry flag; the adcq(dest_hi, 0) immediately
// after folds that carry into the high word, so no carry is ever lost.
8974   addq(dest_lo, src1);
8975   adcq(dest_hi, 0);   // propagate carry-out of the first add into the high word
8976   addq(dest_lo, src2);
8977   adcq(dest_hi, 0);   // propagate carry-out of the second add into the high word
8978 }
8979 
8980 /**
8981  * Multiply 64 bit by 64 bit first loop.
8982  */
8983 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
8984                                            Register y, Register y_idx, Register z,
8985                                            Register carry, Register product,


10766 
10767   BIND(L_byteByByteProlog);
10768   andl(in2, 0x00000007);
10769   movl(tmp2, 1);
10770 
10771   BIND(L_byteByByte);
10772   cmpl(tmp2, in2);
10773   jccb(Assembler::greater, L_exit);
10774     movb(tmp1, Address(in1, 0));
10775     crc32(in_out, tmp1, 1);
10776     incl(in1);
10777     incl(tmp2);
10778     jmp(L_byteByByte);
10779 
10780   BIND(L_exit);
10781 }
10782 #endif // LP64
10783 #undef BIND
10784 #undef BLOCK_COMMENT
10785 
10786 
10787 // Compress char[] array to byte[].












10788 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
10789                                          XMMRegister tmp1Reg, XMMRegister tmp2Reg,
10790                                          XMMRegister tmp3Reg, XMMRegister tmp4Reg,
10791                                          Register tmp5, Register result) {
10792   Label copy_chars_loop, return_length, return_zero, done;
10793 
10794   // rsi: src
10795   // rdi: dst
10796   // rdx: len
10797   // rcx: tmp5
10798   // rax: result
10799 
10800   // rsi holds start addr of source char[] to be compressed
10801   // rdi holds start addr of destination byte[]
10802   // rdx holds length
10803 
10804   assert(len != result, "");
10805 
10806   // save length for return
10807   push(len);
10808 































































































































10809   if (UseSSE42Intrinsics) {
10810     assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
10811     Label copy_32_loop, copy_16, copy_tail;
10812 


10813     movl(result, len);

10814     movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors
10815 
10816     // vectored compression
10817     andl(len, 0xfffffff0);    // vector count (in chars)
10818     andl(result, 0x0000000f);    // tail count (in chars)
10819     testl(len, len);
10820     jccb(Assembler::zero, copy_16);
10821 
10822     // compress 16 chars per iter
10823     movdl(tmp1Reg, tmp5);
10824     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
10825     pxor(tmp4Reg, tmp4Reg);
10826 
10827     lea(src, Address(src, len, Address::times_2));
10828     lea(dst, Address(dst, len, Address::times_1));
10829     negptr(len);
10830 
10831     bind(copy_32_loop);
10832     movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
10833     por(tmp4Reg, tmp2Reg);


10875   testl(result, 0xff00);      // check if Unicode char
10876   jccb(Assembler::notZero, return_zero);
10877   movb(Address(dst, len, Address::times_1), result);  // ASCII char; compress to 1 byte
10878   increment(len);
10879   jcc(Assembler::notZero, copy_chars_loop);
10880 
10881   // if compression succeeded, return length
10882   bind(return_length);
10883   pop(result);
10884   jmpb(done);
10885 
10886   // if compression failed, return 0
10887   bind(return_zero);
10888   xorl(result, result);
10889   addptr(rsp, wordSize);
10890 
10891   bind(done);
10892 }
10893 
10894 // Inflate byte[] array to char[].







10895 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
10896                                         XMMRegister tmp1, Register tmp2) {
// Emits code that inflates a byte[] (src) into a char[] (dst): each byte is
// zero-extended to a 16-bit word. len is the element count; tmp1 is a vector
// scratch register, tmp2 a general-purpose scratch. The fixed register
// bindings below are the ones chosen by the matcher for this intrinsic.
10897   Label copy_chars_loop, done;
10898
10899   // rsi: src
10900   // rdi: dst
10901   // rdx: len
10902   // rcx: tmp2
10903
10904   // rsi holds start addr of source byte[] to be inflated
10905   // rdi holds start addr of destination char[]
10906   // rdx holds length
10907   assert_different_registers(src, dst, len, tmp2);
10908
10909   if (UseSSE42Intrinsics) {
10910     assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
10911     Label copy_8_loop, copy_bytes, copy_tail;
10912
10913     movl(tmp2, len);          // tmp2 = total count; split into vector + tail parts below
10914     andl(tmp2, 0x00000007);   // tail count (in chars)
10915     andl(len, 0xfffffff8);    // vector count (in chars)
10916     jccb(Assembler::zero, copy_tail);   // fewer than 8 chars: skip the vector loop
10917
10918     // vectored inflation
10919     lea(src, Address(src, len, Address::times_1));
10920     lea(dst, Address(dst, len, Address::times_2));
10921     negptr(len);   // base pointers now point past the region; len counts up from -count to 0
10922
10923     // inflate 8 chars per iter
10924     bind(copy_8_loop);
10925     pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
10926     movdqu(Address(dst, len, Address::times_2), tmp1);
10927     addptr(len, 8);
10928     jcc(Assembler::notZero, copy_8_loop);
10929
10930     bind(copy_tail);
10931     movl(len, tmp2);   // len = leftover (< 8) element count
10932
10933     cmpl(len, 4);
10934     jccb(Assembler::less, copy_bytes);
10935
10936     movdl(tmp1, Address(src, 0));  // load 4 byte chars
10937     pmovzxbw(tmp1, tmp1);          // zero-extend the 4 bytes to 4 words
10938     movq(Address(dst, 0), tmp1);
10939     subptr(len, 4);
10940     addptr(src, 4);
10941     addptr(dst, 8);
10942
10943     bind(copy_bytes);
10944   }
// Scalar cleanup: handles the remaining (< 4, or all when SSE4.2 is
// unavailable) elements one byte at a time.
10945   testl(len, len);
10946   jccb(Assembler::zero, done);
10947   lea(src, Address(src, len, Address::times_1));
10948   lea(dst, Address(dst, len, Address::times_2));
10949   negptr(len);   // same negative-index idiom as the vector loop above
10950
10951   // inflate 1 char per iter
10952   bind(copy_chars_loop);
10953   load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
10954   movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
10955   increment(len);
10956   jcc(Assembler::notZero, copy_chars_loop);
10957
10958   bind(done);
10959 }
10960 
10961 
10962 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
10963   switch (cond) {
10964     // Note some conditions are synonyms for others
10965     case Assembler::zero:         return Assembler::notZero;
10966     case Assembler::notZero:      return Assembler::zero;
10967     case Assembler::less:         return Assembler::greaterEqual;
10968     case Assembler::lessEqual:    return Assembler::greater;
10969     case Assembler::greater:      return Assembler::lessEqual;
10970     case Assembler::greaterEqual: return Assembler::less;
10971     case Assembler::below:        return Assembler::aboveEqual;
10972     case Assembler::belowEqual:   return Assembler::above;
10973     case Assembler::above:        return Assembler::belowEqual;
10974     case Assembler::aboveEqual:   return Assembler::below;
10975     case Assembler::overflow:     return Assembler::noOverflow;
10976     case Assembler::noOverflow:   return Assembler::overflow;
10977     case Assembler::negative:     return Assembler::positive;
10978     case Assembler::positive:     return Assembler::negative;
10979     case Assembler::parity:       return Assembler::noParity;
10980     case Assembler::noParity:     return Assembler::parity;




8238     }
8239     subl(result, cnt1);
8240     jmpb(POP_LABEL);
8241   }//if (VM_Version::supports_avx512vlbw())
8242 #endif // _LP64
8243 
8244   // Discard the stored length difference
8245   bind(POP_LABEL);
8246   pop(cnt1);
8247 
8248   // That's it
8249   bind(DONE_LABEL);
8250   if(ae == StrIntrinsicNode::UL) {
8251     negl(result);
8252   }
8253 
8254 }
8255 
8256 // Search for Non-ASCII character (Negative byte value) in a byte array,
8257 // return true if it has any and false otherwise.
8258 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
8259 //   @HotSpotIntrinsicCandidate
8260 //   private static boolean hasNegatives(byte[] ba, int off, int len) {
8261 //     for (int i = off; i < off + len; i++) {
8262 //       if (ba[i] < 0) {
8263 //         return true;
8264 //       }
8265 //     }
8266 //     return false;
8267 //   }
8268 void MacroAssembler::has_negatives(Register ary1, Register len,
8269   Register result, Register tmp1,
8270   XMMRegister vec1, XMMRegister vec2) {

8271   // rsi: byte array
8272   // rcx: len
8273   // rax: result
8274   ShortBranchVerifier sbv(this);
8275   assert_different_registers(ary1, len, result, tmp1);
8276   assert_different_registers(vec1, vec2);
8277   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
8278 
8279   // len == 0
8280   testl(len, len);
8281   jcc(Assembler::zero, FALSE_LABEL);
8282 
8283   if ((UseAVX > 2) && // AVX512
8284     VM_Version::supports_avx512vlbw() &&
8285     VM_Version::supports_bmi2()) {
8286 
8287     set_vector_masking();  // opening of the stub context for programming mask registers
8288 
8289     Label test_64_loop, test_tail;
8290     Register tmp3_aliased = len;
8291 
8292     movl(tmp1, len);
8293     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
8294 
8295     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
8296     andl(len, ~(64 - 1));    // vector count (in chars)
8297     jccb(Assembler::zero, test_tail);
8298 
8299     lea(ary1, Address(ary1, len, Address::times_1));
8300     negptr(len);
8301 
8302     bind(test_64_loop);
8303     // Check whether our 64 elements of size byte contain negatives
8304     evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
8305     kortestql(k2, k2);
8306     jcc(Assembler::notZero, TRUE_LABEL);
8307 
8308     addptr(len, 64);
8309     jccb(Assembler::notZero, test_64_loop);
8310 
8311 
8312     bind(test_tail);
8313     // bail out when there is nothing to be done
8314     testl(tmp1, -1);
8315     jcc(Assembler::zero, FALSE_LABEL);
8316 
8317     // Save k1
8318     kmovql(k3, k1);
8319 
8320     // ~(~0 << len) applied up to two times (for 32-bit scenario)
8321   #ifdef _LP64
8322       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
8323       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
8324       notq(tmp3_aliased);
8325       kmovql(k1, tmp3_aliased);
8326   #else
8327     Label k_init;
8328     jmp(k_init);
8329 
8330     // We could not read 64-bits from a general purpose register thus we move
8331     // data required to compose 64 1's to the instruction stream
8332     // We emit 64 byte wide series of elements from 0..63 which later on would
8333     // be used as a compare targets with tail count contained in tmp1 register.
8334     // Result would be a k1 register having tmp1 consecutive number or 1
8335     // counting from least significant bit.
8336     address tmp = pc();
8337     emit_int64(0x0706050403020100);
8338     emit_int64(0x0F0E0D0C0B0A0908);
8339     emit_int64(0x1716151413121110);
8340     emit_int64(0x1F1E1D1C1B1A1918);
8341     emit_int64(0x2726252423222120);
8342     emit_int64(0x2F2E2D2C2B2A2928);
8343     emit_int64(0x3736353433323130);
8344     emit_int64(0x3F3E3D3C3B3A3938);
8345 
8346     bind(k_init);
8347     lea(len, InternalAddress(tmp));
8348     // create mask to test for negative byte inside a vector
8349     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
8350     evpcmpgtb(k1, vec1, Address(len, 0), Assembler::AVX_512bit);
8351 
8352 #endif
8353     evpcmpgtb(k2, k1, vec2, Address(ary1, 0), Assembler::AVX_512bit);
8354     ktestq(k2, k1);
8355     // Restore k1
8356     kmovql(k1, k3);
8357     jcc(Assembler::notZero, TRUE_LABEL);
8358 
8359     jmp(FALSE_LABEL);
8360 
8361     clear_vector_masking();   // closing of the stub context for programming mask registers
8362   } else {
8363     movl(result, len); // copy
8364 
8365     if (UseAVX == 2 && UseSSE >= 2) {
8366       // With AVX2, use 32-byte vector compare
8367       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
8368 
8369       // Compare 32-byte vectors
8370       andl(result, 0x0000001f);  //   tail count (in bytes)
8371       andl(len, 0xffffffe0);   // vector count (in bytes)
8372       jccb(Assembler::zero, COMPARE_TAIL);
8373 
8374       lea(ary1, Address(ary1, len, Address::times_1));
8375       negptr(len);
8376 
8377       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
8378       movdl(vec2, tmp1);
8379       vpbroadcastd(vec2, vec2);
8380 
8381       bind(COMPARE_WIDE_VECTORS);
8382       vmovdqu(vec1, Address(ary1, len, Address::times_1));
8383       vptest(vec1, vec2);
8384       jccb(Assembler::notZero, TRUE_LABEL);
8385       addptr(len, 32);


8415 
8416       bind(COMPARE_WIDE_VECTORS);
8417       movdqu(vec1, Address(ary1, len, Address::times_1));
8418       ptest(vec1, vec2);
8419       jccb(Assembler::notZero, TRUE_LABEL);
8420       addptr(len, 16);
8421       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
8422 
8423       testl(result, result);
8424       jccb(Assembler::zero, FALSE_LABEL);
8425 
8426       movdqu(vec1, Address(ary1, result, Address::times_1, -16));
8427       ptest(vec1, vec2);
8428       jccb(Assembler::notZero, TRUE_LABEL);
8429       jmpb(FALSE_LABEL);
8430 
8431       bind(COMPARE_TAIL); // len is zero
8432       movl(len, result);
8433       // Fallthru to tail compare
8434     }
8435   }
8436   // Compare 4-byte vectors
8437   andl(len, 0xfffffffc); // vector count (in bytes)
8438   jccb(Assembler::zero, COMPARE_CHAR);
8439 
8440   lea(ary1, Address(ary1, len, Address::times_1));
8441   negptr(len);
8442 
8443   bind(COMPARE_VECTORS);
8444   movl(tmp1, Address(ary1, len, Address::times_1));
8445   andl(tmp1, 0x80808080);
8446   jccb(Assembler::notZero, TRUE_LABEL);
8447   addptr(len, 4);
8448   jcc(Assembler::notZero, COMPARE_VECTORS);
8449 
8450   // Compare trailing char (final 2 bytes), if any
8451   bind(COMPARE_CHAR);
8452   testl(result, 0x2);   // tail  char
8453   jccb(Assembler::zero, COMPARE_BYTE);
8454   load_unsigned_short(tmp1, Address(ary1, 0));
8455   andl(tmp1, 0x00008080);


8909     testl(count, 1<<(shift-1));
8910     jccb(Assembler::zero, L_fill_byte);
8911     movw(Address(to, 0), value);
8912     if (t == T_BYTE) {
8913       addptr(to, 2);
8914       BIND(L_fill_byte);
8915       // fill trailing byte
8916       testl(count, 1);
8917       jccb(Assembler::zero, L_exit);
8918       movb(Address(to, 0), value);
8919     } else {
8920       BIND(L_fill_byte);
8921     }
8922   } else {
8923     BIND(L_fill_2_bytes);
8924   }
8925   BIND(L_exit);
8926 }
8927 
8928 // encode char[] to byte[] in ISO_8859_1
8929    //@HotSpotIntrinsicCandidate
8930    //private static int implEncodeISOArray(byte[] sa, int sp,
8931    //byte[] da, int dp, int len) {
8932    //  int i = 0;
8933    //  for (; i < len; i++) {
8934    //    char c = StringUTF16.getChar(sa, sp++);
8935    //    if (c > '\u00FF')
8936    //      break;
8937    //    da[dp++] = (byte)c;
8938    //  }
8939    //  return i;
8940    //}
8941 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
8942   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
8943   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
8944   Register tmp5, Register result) {
8945 
8946   // rsi: src
8947   // rdi: dst
8948   // rdx: len
8949   // rcx: tmp5
8950   // rax: result
8951   ShortBranchVerifier sbv(this);
8952   assert_different_registers(src, dst, len, tmp5, result);
8953   Label L_done, L_copy_1_char, L_copy_1_char_exit;
8954 
8955   // set result
8956   xorl(result, result);
8957   // check for zero length
8958   testl(len, len);
8959   jcc(Assembler::zero, L_done);
8960 
8961   movl(result, len);
8962 
8963   // Setup pointers
8964   lea(src, Address(src, len, Address::times_2)); // char[]
8965   lea(dst, Address(dst, len, Address::times_1)); // byte[]
8966   negptr(len);
8967 
8968   if (UseSSE42Intrinsics || UseAVX >= 2) {
8969     assert(UseSSE42Intrinsics ? UseSSE >= 4 : true, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
8970     Label L_chars_8_check, L_copy_8_chars, L_copy_8_chars_exit;
8971     Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
8972 
8973     if (UseAVX >= 2) {
8974       Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
8975       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
8976       movdl(tmp1Reg, tmp5);
8977       vpbroadcastd(tmp1Reg, tmp1Reg);
8978       jmpb(L_chars_32_check);
8979 
8980       bind(L_copy_32_chars);


9049     packuswb(tmp3Reg, tmp1Reg);
9050     movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
9051     addptr(len, 8);
9052     jccb(Assembler::lessEqual, L_copy_8_chars);
9053 
9054     bind(L_copy_8_chars_exit);
9055     subptr(len, 8);
9056     jccb(Assembler::zero, L_done);
9057   }
9058 
9059   bind(L_copy_1_char);
9060   load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
9061   testl(tmp5, 0xff00);      // check if Unicode char
9062   jccb(Assembler::notZero, L_copy_1_char_exit);
9063   movb(Address(dst, len, Address::times_1, 0), tmp5);
9064   addptr(len, 1);
9065   jccb(Assembler::less, L_copy_1_char);
9066 
9067   bind(L_copy_1_char_exit);
9068   addptr(result, len); // len is negative count of not processed elements
9069 
9070   bind(L_done);
9071 }
9072 
9073 #ifdef _LP64
9074 /**
9075  * Helper for multiply_to_len().
9076  */
9077 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
// Emits code computing the 128-bit accumulation  dest_hi:dest_lo += src1 + src2.
// Each 64-bit addq may set the carry flag; the adcq(dest_hi, 0) immediately
// after folds that carry into the high word, so no carry is ever lost.
9078   addq(dest_lo, src1);
9079   adcq(dest_hi, 0);   // propagate carry-out of the first add into the high word
9080   addq(dest_lo, src2);
9081   adcq(dest_hi, 0);   // propagate carry-out of the second add into the high word
9082 }
9083 
9084 /**
9085  * Multiply 64 bit by 64 bit first loop.
9086  */
9087 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
9088                                            Register y, Register y_idx, Register z,
9089                                            Register carry, Register product,


10870 
10871   BIND(L_byteByByteProlog);
10872   andl(in2, 0x00000007);
10873   movl(tmp2, 1);
10874 
10875   BIND(L_byteByByte);
10876   cmpl(tmp2, in2);
10877   jccb(Assembler::greater, L_exit);
10878     movb(tmp1, Address(in1, 0));
10879     crc32(in_out, tmp1, 1);
10880     incl(in1);
10881     incl(tmp2);
10882     jmp(L_byteByByte);
10883 
10884   BIND(L_exit);
10885 }
10886 #endif // LP64
10887 #undef BIND
10888 #undef BLOCK_COMMENT
10889 

10890 // Compress char[] array to byte[].
10891 //   ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
10892 //   @HotSpotIntrinsicCandidate
10893 //   private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
10894 //     for (int i = 0; i < len; i++) {
10895 //       int c = src[srcOff++];
10896 //       if (c >>> 8 != 0) {
10897 //         return 0;
10898 //       }
10899 //       dst[dstOff++] = (byte)c;
10900 //     }
10901 //     return len;
10902 //   }
10903 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
10904   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
10905   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
10906   Register tmp5, Register result) {
10907   Label copy_chars_loop, return_length, return_zero, done, below_threshold;
10908 
10909   // rsi: src
10910   // rdi: dst
10911   // rdx: len
10912   // rcx: tmp5
10913   // rax: result
10914 
10915   // rsi holds start addr of source char[] to be compressed
10916   // rdi holds start addr of destination byte[]
10917   // rdx holds length
10918 
10919   assert(len != result, "");
10920 
10921   // save length for return
10922   push(len);
10923 
10924   if ((UseAVX > 2) && // AVX512
10925     VM_Version::supports_avx512vlbw() &&
10926     VM_Version::supports_bmi2()) {
10927 
10928     set_vector_masking();  // opening of the stub context for programming mask registers
10929 
10930     Label copy_32_loop, copy_loop_tail, copy_just_portion_of_candidates;
10931 
10932     // alignement
10933     Label post_alignement;
10934 
10935     // if length of the string is less than 16, handle it in an old fashioned
10936     // way
10937     testl(len, -32);
10938     jcc(Assembler::zero, below_threshold);
10939 
10940     // First check whether a character is compressable ( <= 0xFF).
10941     // Create mask to test for Unicode chars inside zmm vector
10942     movl(result, 0x00FF);
10943     evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
10944 
10945     testl(len, -64);
10946     jcc(Assembler::zero, post_alignement);
10947 
10948     // Save k1
10949     kmovql(k3, k1);
10950 
10951     movl(tmp5, dst);
10952     andl(tmp5, (64 - 1));
10953     negl(tmp5);
10954     andl(tmp5, (64 - 1));
10955 
10956     // bail out when there is nothing to be done
10957     testl(tmp5, 0xFFFFFFFF);
10958     jcc(Assembler::zero, post_alignement);
10959 
10960     // ~(~0 << len), where len is the # of remaining elements to process
10961     movl(result, 0xFFFFFFFF);
10962     shlxl(result, result, tmp5);
10963     notl(result);
10964 
10965     kmovdl(k1, result);
10966 
10967     evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit);
10968     evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, ComparisonPredicate::le, Assembler::AVX_512bit);
10969     ktestd(k2, k1);
10970     jcc(Assembler::carryClear, copy_just_portion_of_candidates);
10971 
10972     evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
10973 
10974     addptr(src, tmp5);
10975     addptr(src, tmp5);
10976     addptr(dst, tmp5);
10977     subl(len, tmp5);
10978 
10979     bind(post_alignement);
10980     // end of alignement
10981 
10982     movl(tmp5, len);
10983     andl(tmp5, (32 - 1));   // tail count (in chars)
10984     andl(len, ~(32 - 1));    // vector count (in chars)
10985     jcc(Assembler::zero, copy_loop_tail);
10986 
10987     lea(src, Address(src, len, Address::times_2));
10988     lea(dst, Address(dst, len, Address::times_1));
10989     negptr(len);
10990 
10991     bind(copy_32_loop);
10992     evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
10993     evpcmpuw(k2, tmp1Reg, tmp2Reg, ComparisonPredicate::le, Assembler::AVX_512bit);
10994     kortestdl(k2, k2);
10995     jcc(Assembler::carryClear, copy_just_portion_of_candidates);
10996 
10997     // All elements in current processed chunk are valid candidates for
10998     // compression. Write a truncated byte elements to the memory.
10999     evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
11000     addptr(len, 32);
11001     jcc(Assembler::notZero, copy_32_loop);
11002 
11003     bind(copy_loop_tail);
11004     // bail out when there is nothing to be done
11005     testl(tmp5, 0xFFFFFFFF);
11006     jcc(Assembler::zero, return_length);
11007 
11008     // Save k1
11009     kmovql(k3, k1);
11010 
11011     movl(len, tmp5);
11012 
11013     // ~(~0 << len), where len is the # of remaining elements to process
11014     movl(result, 0xFFFFFFFF);
11015     shlxl(result, result, len);
11016     notl(result);
11017 
11018     kmovdl(k1, result);
11019 
11020     evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit);
11021     evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, ComparisonPredicate::le, Assembler::AVX_512bit);
11022     ktestd(k2, k1);
11023     jcc(Assembler::carryClear, copy_just_portion_of_candidates);
11024 
11025     evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
11026     // Restore k1
11027     kmovql(k1, k3);
11028 
11029     jmp(return_length);
11030 
11031     bind(copy_just_portion_of_candidates);
11032     kmovdl(tmp5, k2);
11033     tzcntl(tmp5, tmp5);
11034 
11035     // ~(~0 << tmp5), where tmp5 is a number of elements in an array from the
11036     // result to the first element larger than 0xFF
11037     movl(result, 0xFFFFFFFF);
11038     shlxl(result, result, tmp5);
11039     notl(result);
11040 
11041     kmovdl(k1, result);
11042 
11043     evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
11044     // Restore k1
11045     kmovql(k1, k3);
11046 
11047     jmp(return_zero);
11048 
11049     clear_vector_masking();   // closing of the stub context for programming mask registers
11050   }
11051   if (UseSSE42Intrinsics) {
11052     assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
11053     Label copy_32_loop, copy_16, copy_tail;
11054 
11055     bind(below_threshold);
11056 
11057     movl(result, len);
11058 
11059     movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors
11060 
11061     // vectored compression
11062     andl(len, 0xfffffff0);    // vector count (in chars)
11063     andl(result, 0x0000000f);    // tail count (in chars)
11064     testl(len, len);
11065     jccb(Assembler::zero, copy_16);
11066 
11067     // compress 16 chars per iter
11068     movdl(tmp1Reg, tmp5);
11069     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
11070     pxor(tmp4Reg, tmp4Reg);
11071 
11072     lea(src, Address(src, len, Address::times_2));
11073     lea(dst, Address(dst, len, Address::times_1));
11074     negptr(len);
11075 
11076     bind(copy_32_loop);
11077     movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
11078     por(tmp4Reg, tmp2Reg);


11120   testl(result, 0xff00);      // check if Unicode char
11121   jccb(Assembler::notZero, return_zero);
11122   movb(Address(dst, len, Address::times_1), result);  // ASCII char; compress to 1 byte
11123   increment(len);
11124   jcc(Assembler::notZero, copy_chars_loop);
11125 
11126   // if compression succeeded, return length
11127   bind(return_length);
11128   pop(result);
11129   jmpb(done);
11130 
11131   // if compression failed, return 0
11132   bind(return_zero);
11133   xorl(result, result);
11134   addptr(rsp, wordSize);
11135 
11136   bind(done);
11137 }
11138 
11139 // Inflate byte[] array to char[].
11140 //   ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
11141 //   @HotSpotIntrinsicCandidate
11142 //   private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
11143 //     for (int i = 0; i < len; i++) {
11144 //       dst[dstOff++] = (char)(src[srcOff++] & 0xff);
11145 //     }
11146 //   }
11147 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
11148   XMMRegister tmp1, Register tmp2) {
      // Emits code that inflates a Latin-1 byte[] (src) into a UTF-16 char[]
      // (dst): each source byte b becomes the 16-bit char (b & 0xff).
      // 'len' is the element count (chars == bytes here).
      // Up to three runtime paths are emitted, best-first:
      //   1. AVX-512 (vl+bw) + BMI2: 32 chars/iter plus one masked tail op.
      //   2. SSE4.2 / AVX2: 16 or 8 chars/iter plus 4-char and scalar tails.
      //   3. Scalar fallback at the bottom: 1 char/iter (also finishes tails).
      // Clobbers src, dst, len and tmp2; uses vector register tmp1 as scratch.
11149   Label copy_chars_loop, done, below_threshold;

11150   // rsi: src
11151   // rdi: dst
11152   // rdx: len
11153   // rcx: tmp2
11154 
11155   // rsi holds start addr of source byte[] to be inflated
11156   // rdi holds start addr of destination char[]
11157   // rdx holds length
11158   assert_different_registers(src, dst, len, tmp2);
11159 
11160   if ((UseAVX > 2) && // AVX512
11161     VM_Version::supports_avx512vlbw() &&
11162     VM_Version::supports_bmi2()) {
11163 
11164     set_vector_masking();  // opening of the stub context for programming mask registers
11165 
11166     Label copy_32_loop, copy_tail;
      // len is no longer needed once the tail mask is being built, so its
      // register is reused as scratch for the mask value.
11167     Register tmp3_aliased = len;
11168 
11169     // if length of the string is less than 16, handle it in an old fashioned
11170     // way
      // testl(len, -16) clears ZF iff any bit >= bit 4 is set, i.e. len >= 16.
11171     testl(len, -16);
11172     jcc(Assembler::zero, below_threshold);
11173 
11174     // In order to use only one arithmetic operation for the main loop we use
11175     // this pre-calculation
11176     movl(tmp2, len);
11177     andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
11178     andl(len, -32);     // vector count
11179     jccb(Assembler::zero, copy_tail);
11180 
      // Point src/dst just past the vectorized region and run len from
      // -vector_count up to zero: a single addptr then both advances the
      // index and sets ZF for the loop-exit test.
11181     lea(src, Address(src, len, Address::times_1));
11182     lea(dst, Address(dst, len, Address::times_2));
11183     negptr(len);
11184 
11185 
11186     // inflate 32 chars per iter
11187     bind(copy_32_loop);
11188     vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);  // zero-extend 32 bytes to 32 words
11189     evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
11190     addptr(len, 32);
11191     jcc(Assembler::notZero, copy_32_loop);
11192 
11193     bind(copy_tail);
11194     // bail out when there is nothing to be done
11195     testl(tmp2, -1); // we don't destroy the contents of tmp2 here
11196     jcc(Assembler::zero, done);
11197 
11198     // Save k1
11199     kmovql(k2, k1);
11200 
11201     // ~(~0 << length), where length is the # of remaining elements to process
      // Builds a k-mask with the low tmp2 bits set, selecting only the
      // remaining (< 32) tail elements.
11202     movl(tmp3_aliased, -1);
11203     shlxl(tmp3_aliased, tmp3_aliased, tmp2);
11204     notl(tmp3_aliased);
11205     kmovdl(k1, tmp3_aliased);
      // One masked load+zero-extend and one masked store handle the whole
      // tail without a scalar loop. src/dst already point at the tail.
11206     evpmovzxbw(tmp1, k1, Address(src, 0), Assembler::AVX_512bit);
11207     evmovdquw(Address(dst, 0), k1, tmp1, Assembler::AVX_512bit);
11208 
11209     // Restore k1
11210     kmovql(k1, k2);
11211     jmp(done);
11212 
      // Note: this runs at code-emission time (it configures the assembler,
      // it is not an emitted instruction), so it still executes even though
      // the emitted jmp(done) above ends the runtime path.
11213     clear_vector_masking();   // closing of the stub context for programming mask registers
11214   }
      // NOTE(review): the AVX-512 path jumps to below_threshold, which is only
      // bound inside this UseAVX > 1 branch — valid on the presumption that
      // UseAVX > 2 implies UseAVX > 1 and UseSSE42Intrinsics; confirm against
      // VM flag constraints.
11215   if (UseSSE42Intrinsics) {
11216     assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
11217     Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
11218 
11219     movl(tmp2, len);
11220 
11221     if (UseAVX > 1) {
11222       andl(tmp2, (16 - 1));   // tail count (in chars), 16 element wide loop
11223       andl(len, -16);         // vector count (in chars)
11224       jccb(Assembler::zero, copy_new_tail);
11225     } else {
11226       andl(tmp2, 0x00000007);   // tail count (in chars)
11227       andl(len, 0xfffffff8);    // vector count (in chars)
11228       jccb(Assembler::zero, copy_tail);
11229     }
11230 
11231     // vectored inflation
      // Same negative-index idiom as above: bases past the region, len counts
      // up from -vector_count to zero.
11232     lea(src, Address(src, len, Address::times_1));
11233     lea(dst, Address(dst, len, Address::times_2));
11234     negptr(len);
11235 
11236     if (UseAVX > 1) {
      // inflate 16 chars per iter with 256-bit ops
11237       bind(copy_16_loop);
11238       vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
11239       vmovdqu(Address(dst, len, Address::times_2), tmp1);
11240       addptr(len, 16);
11241       jcc(Assembler::notZero, copy_16_loop);
11242 
11243       bind(below_threshold);
11244       bind(copy_new_tail);
      // Entered either from the AVX-512 short-string jump (len still holds
      // the full count — re-derive tmp2 from it) or by falling out of the
      // 16-char loop above (len is spent — restore it from the saved tail
      // count in tmp2).
11245       if (UseAVX > 2) {
11246         movl(tmp2, len);
11247       }
11248       else {
11249         movl(len, tmp2);
11250       }
      // Split the remaining < 16 chars into one optional 8-char vector step
      // (len) and a < 8 scalar remainder (tmp2).
11251       andl(tmp2, 0x00000007);
11252       andl(len, 0xFFFFFFF8);
11253       jccb(Assembler::zero, copy_tail);
11254 
      // Inflate exactly 8 chars and advance the bases past them.
11255       pmovzxbw(tmp1, Address(src, 0));
11256       movdqu(Address(dst, 0), tmp1);
11257       addptr(src, 8);
11258       addptr(dst, 2 * 8);
11259 
      // NOTE(review): second argument presumably requests the short (near)
      // jump encoding — confirm against the jmp(Label&, bool) declaration.
11260       jmp(copy_tail, true);
11261     }
11262 
11263     // inflate 8 chars per iter
11264     bind(copy_8_loop);
11265     pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
11266     movdqu(Address(dst, len, Address::times_2), tmp1);
11267     addptr(len, 8);
11268     jcc(Assembler::notZero, copy_8_loop);
11269 
11270     bind(copy_tail);
11271     movl(len, tmp2);   // len := remaining tail count (< 8)
11272 
11273     cmpl(len, 4);
11274     jccb(Assembler::less, copy_bytes);
11275 
      // Inflate 4 chars at once: 4 bytes in, 8 bytes (4 words) out.
11276     movdl(tmp1, Address(src, 0));  // load 4 byte chars
11277     pmovzxbw(tmp1, tmp1);
11278     movq(Address(dst, 0), tmp1);
11279     subptr(len, 4);
11280     addptr(src, 4);
11281     addptr(dst, 8);
11282 
11283     bind(copy_bytes);
11284   }
      // Scalar path: handles the final 0-3 chars of the SSE path, or the
      // whole array when no SIMD path was emitted.
11285   testl(len, len);
11286   jccb(Assembler::zero, done);
11287   lea(src, Address(src, len, Address::times_1));
11288   lea(dst, Address(dst, len, Address::times_2));
11289   negptr(len);
11290 
11291   // inflate 1 char per iter
11292   bind(copy_chars_loop);
11293   load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
11294   movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
11295   increment(len);
11296   jcc(Assembler::notZero, copy_chars_loop);
11297 
11298   bind(done);
11299 }

11300 
11301 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
11302   switch (cond) {
11303     // Note some conditions are synonyms for others
11304     case Assembler::zero:         return Assembler::notZero;
11305     case Assembler::notZero:      return Assembler::zero;
11306     case Assembler::less:         return Assembler::greaterEqual;
11307     case Assembler::lessEqual:    return Assembler::greater;
11308     case Assembler::greater:      return Assembler::lessEqual;
11309     case Assembler::greaterEqual: return Assembler::less;
11310     case Assembler::below:        return Assembler::aboveEqual;
11311     case Assembler::belowEqual:   return Assembler::above;
11312     case Assembler::above:        return Assembler::belowEqual;
11313     case Assembler::aboveEqual:   return Assembler::below;
11314     case Assembler::overflow:     return Assembler::noOverflow;
11315     case Assembler::noOverflow:   return Assembler::overflow;
11316     case Assembler::negative:     return Assembler::positive;
11317     case Assembler::positive:     return Assembler::negative;
11318     case Assembler::parity:       return Assembler::noParity;
11319     case Assembler::noParity:     return Assembler::parity;


< prev index next >