// AVX 3-operands instructions
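//
// Each wrapper below takes an AddressLiteral source operand. When the literal
// is reachable with RIP-relative addressing it is used directly as a memory
// operand; otherwise the address is first materialized into rscratch1 with
// lea() and the instruction is emitted with an indirect operand.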

void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vaddsd(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vaddsd(dst, nds, Address(rscratch1, 0));
  }
}

void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vaddss(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vaddss(dst, nds, Address(rscratch1, 0));
  }
}

void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
  if (reachable(src)) {
    vandpd(dst, nds, as_Address(src), vector_len);
  } else {
    lea(rscratch1, src);
    vandpd(dst, nds, Address(rscratch1, 0), vector_len);
  }
}

void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
  if (reachable(src)) {
    vandps(dst, nds, as_Address(src), vector_len);
  } else {
    lea(rscratch1, src);
    vandps(dst, nds, Address(rscratch1, 0), vector_len);
  }
}

void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vdivsd(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vdivsd(dst, nds, Address(rscratch1, 0));
  }
}

void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vdivss(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vdivss(dst, nds, Address(rscratch1, 0));
  }
}

// ... (elided) ...

void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vsubsd(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vsubsd(dst, nds, Address(rscratch1, 0));
  }
}

void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vsubss(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vsubss(dst, nds, Address(rscratch1, 0));
  }
}

void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
  if (reachable(src)) {
    vxorpd(dst, nds, as_Address(src), vector_len);
  } else {
    lea(rscratch1, src);
    vxorpd(dst, nds, Address(rscratch1, 0), vector_len);
  }
}

void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
  if (reachable(src)) {
    vxorps(dst, nds, as_Address(src), vector_len);
  } else {
    lea(rscratch1, src);
    vxorps(dst, nds, Address(rscratch1, 0), vector_len);
  }
}


//////////////////////////////////////////////////////////////////////////////////
#if INCLUDE_ALL_GCS

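// G1 pre-barrier (SATB): before a reference field is overwritten, the value it
// previously held (pre_val) is recorded in the thread's SATB mark queue so that
// concurrent marking still visits the old object.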
void MacroAssembler::g1_write_barrier_pre(Register obj,
                                          Register pre_val,
                                          Register thread,
                                          Register tmp,
                                          bool tosca_live,
                                          bool expand_call) {

  // If expand_call is true then we expand the call_VM_leaf macro
  // directly to skip generating the check by
  // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.

#ifdef _LP64
  assert(thread == r15_thread, "must be");

// ... (elided) ...

  adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0);
#endif
}
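
// fp_runtime_fallback() spills the caller-visible register state (all GPRs via
// pusha() plus, depending on UseSSE and MaxVectorSize, any live XMM/YMM state)
// around a call to the given runtime entry point.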

void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
  pusha();

  // if we are coming from c1, xmm registers may be live
  int off = 0;
  if (UseSSE == 1) {
    subptr(rsp, sizeof(jdouble)*8);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm3);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm4);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm5);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
  } else if (UseSSE >= 2) {
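    // On AVX-512 capable CPUs, load the opmask register k1 with all ones so
    // that EVEX-encoded instructions emitted later, which may use k1 as their
    // write mask, update every vector element.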
    if (UseAVX > 2) {
      movl(rbx, 0xffff);
#ifdef _LP64
      kmovql(k1, rbx);
#else
      kmovdl(k1, rbx);
#endif
    }
#ifdef COMPILER2
    if (MaxVectorSize > 16) {
      assert(UseAVX > 0, "256bit vectors are supported only with AVX");
      // Save upper half of YMM registers
      subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
      vextractf128h(Address(rsp,  0),xmm0);
      vextractf128h(Address(rsp, 16),xmm1);
      vextractf128h(Address(rsp, 32),xmm2);
      vextractf128h(Address(rsp, 48),xmm3);
      vextractf128h(Address(rsp, 64),xmm4);
      vextractf128h(Address(rsp, 80),xmm5);
      vextractf128h(Address(rsp, 96),xmm6);
      vextractf128h(Address(rsp,112),xmm7);
#ifdef _LP64
      vextractf128h(Address(rsp,128),xmm8);
      vextractf128h(Address(rsp,144),xmm9);
      vextractf128h(Address(rsp,160),xmm10);
      vextractf128h(Address(rsp,176),xmm11);
      vextractf128h(Address(rsp,192),xmm12);
      vextractf128h(Address(rsp,208),xmm13);

// ... (elided) ...

    movl(Address(to, 4), value);
    addptr(to, 8);
    BIND(L_fill_8_bytes);
    subl(count, 1 << (shift + 1));
    jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
    // fall through to fill 4 bytes
  } else {
    Label L_fill_32_bytes;
    if (!UseUnalignedLoadStores) {
      // align to 8 bytes, we know we are 4 byte aligned to start
      testptr(to, 4);
      jccb(Assembler::zero, L_fill_32_bytes);
      movl(Address(to, 0), value);
      addptr(to, 4);
      subl(count, 1<<shift);
    }
    BIND(L_fill_32_bytes);
    {
      assert( UseSSE >= 2, "supported cpu only" );
      Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
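      // Preload opmask k1 with all ones so that the EVEX-encoded stores below
      // write every element rather than a masked subset.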
      if (UseAVX > 2) {
        movl(rtmp, 0xffff);
#ifdef _LP64
        kmovql(k1, rtmp);
#else
        kmovdl(k1, rtmp);
#endif
      }
      movdl(xtmp, value);
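      // Broadcast the 32-bit fill value across the widest available vector
      // register, then store 64 bytes per loop iteration; the remaining tail
      // is finished in 32-, 8- and 4-byte steps.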
      if (UseAVX > 2 && UseUnalignedLoadStores) {
        // Fill 64-byte chunks
        Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
        evpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);

        subl(count, 16 << shift);
        jcc(Assembler::less, L_check_fill_32_bytes);
        align(16);

        BIND(L_fill_64_bytes_loop);
        evmovdqu(Address(to, 0), xtmp, Assembler::AVX_512bit);
        addptr(to, 64);
        subl(count, 16 << shift);
        jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);

        BIND(L_check_fill_32_bytes);
        addl(count, 8 << shift);
        jccb(Assembler::less, L_check_fill_8_bytes);
        evmovdqu(Address(to, 0), xtmp, Assembler::AVX_256bit);
        addptr(to, 32);
        subl(count, 8 << shift);

        BIND(L_check_fill_8_bytes);
      } else if (UseAVX == 2 && UseUnalignedLoadStores) {
        // Fill 64-byte chunks
        Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
        vpbroadcastd(xtmp, xtmp);

        subl(count, 16 << shift);
        jcc(Assembler::less, L_check_fill_32_bytes);
        align(16);

        BIND(L_fill_64_bytes_loop);
        vmovdqu(Address(to, 0), xtmp);
        vmovdqu(Address(to, 32), xtmp);
        addptr(to, 64);
        subl(count, 16 << shift);
        jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);

        BIND(L_check_fill_32_bytes);
        addl(count, 8 << shift);
        jccb(Assembler::less, L_check_fill_8_bytes);
        vmovdqu(Address(to, 0), xtmp);
        addptr(to, 32);

// ... (elided) ...

  // Setup pointers
  lea(src, Address(src, len, Address::times_2)); // char[]
  lea(dst, Address(dst, len, Address::times_1)); // byte[]
  negptr(len);

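  // Vectorized fast path: the mask 0xff00ff00 (broadcast into tmp1Reg) covers
  // the high byte of every 16-bit char. If (v)ptest finds any of those bits
  // set, a char does not fit in a single byte and the vector loop exits;
  // otherwise packuswb/vpackuswb narrows the chars to bytes, with vpermq
  // repairing the 128-bit lane order in the 256-bit case.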
  if (UseSSE42Intrinsics || UseAVX >= 2) {
    Label L_chars_8_check, L_copy_8_chars, L_copy_8_chars_exit;
    Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;

    if (UseAVX >= 2) {
      Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
      movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
      movdl(tmp1Reg, tmp5);
      vpbroadcastd(tmp1Reg, tmp1Reg);
      jmpb(L_chars_32_check);

      bind(L_copy_32_chars);
      vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
      vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
      vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
      vptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
      jccb(Assembler::notZero, L_copy_32_chars_exit);
      vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
      vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
      vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);

      bind(L_chars_32_check);
      addptr(len, 32);
      jccb(Assembler::lessEqual, L_copy_32_chars);

      bind(L_copy_32_chars_exit);
      subptr(len, 16);
      jccb(Assembler::greater, L_copy_16_chars_exit);

    } else if (UseSSE42Intrinsics) {
      movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
      movdl(tmp1Reg, tmp5);
      pshufd(tmp1Reg, tmp1Reg, 0);
      jmpb(L_chars_16_check);
    }

    bind(L_copy_16_chars);
    if (UseAVX >= 2) {
      vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
      vptest(tmp2Reg, tmp1Reg);
      jccb(Assembler::notZero, L_copy_16_chars_exit);
      vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
      vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
    } else {
      if (UseAVX > 0) {
        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
        vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
      } else {
        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
        por(tmp2Reg, tmp3Reg);
        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
        por(tmp2Reg, tmp4Reg);
      }
      ptest(tmp2Reg, tmp1Reg);        // check for Unicode chars in vector
      jccb(Assembler::notZero, L_copy_16_chars_exit);
      packuswb(tmp3Reg, tmp4Reg);
    }
    movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);

    bind(L_chars_16_check);
    addptr(len, 16);
    jccb(Assembler::lessEqual, L_copy_16_chars);

    bind(L_copy_16_chars_exit);
    if (UseAVX >= 2) {
      // clean upper bits of YMM registers
      vpxor(tmp2Reg, tmp2Reg);

// ... (elided) ...

/**
 * ...
 *
 * uint32_t crc;
 * val = crc_table[(val ^ crc) & 0xFF];
 * crc = val ^ (crc >> 8);
 *
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  xorl(val, crc);
  andl(val, 0xFF);
  shrl(crc, 8); // unsigned shift
  xorl(crc, Address(table, val, Address::times_4, 0));
}

/**
 * Fold 128-bit data chunk
 */
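//
// Each fold step carry-less multiplies the current 128-bit remainder (xcrc)
// by the two 64-bit folding constants held in xK and XORs the products with
// the next 16 bytes of input, producing a new value that is congruent to the
// old remainder plus that data modulo the CRC polynomial.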
void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
  if (UseAVX > 0) {
    vpclmulhdq(xtmp, xK, xcrc); // [123:64]
    vpclmulldq(xcrc, xK, xcrc); // [63:0]
    vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
    pxor(xcrc, xtmp);
  } else {
    movdqa(xtmp, xcrc);
    pclmulhdq(xtmp, xK); // [123:64]
    pclmulldq(xcrc, xK); // [63:0]
    pxor(xcrc, xtmp);
    movdqu(xtmp, Address(buf, offset));
    pxor(xcrc, xtmp);
  }
}

void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
  if (UseAVX > 0) {
    vpclmulhdq(xtmp, xK, xcrc);
    vpclmulldq(xcrc, xK, xcrc);
    pxor(xcrc, xbuf);
    pxor(xcrc, xtmp);
  } else {
    movdqa(xtmp, xcrc);
    pclmulhdq(xtmp, xK);

// ... (elided) ...

  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);

  // Fold the rest of the 128-bit data chunks
  BIND(L_fold_tail);
  addl(len, 3);
  jccb(Assembler::lessEqual, L_fold_128b);
  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));

  BIND(L_fold_tail_loop);
  fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0);
  addptr(buf, 16);
  decrementl(len);
  jccb(Assembler::greater, L_fold_tail_loop);

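  // Final reduction: using constants loaded from crc_by128_masks, two further
  // carry-less multiplies plus the shift/xor sequence below collapse the
  // 128-bit remainder, and the eight table-driven 8-bit folds that follow
  // produce the final 32-bit CRC.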
  // Fold 128 bits in xmm1 down into 32 bits in crc register.
  BIND(L_fold_128b);
  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
  if (UseAVX > 0) {
    vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
    vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
    vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
  } else {
    movdqa(xmm2, xmm0);
    pclmulqdq(xmm2, xmm1, 0x1);
    movdqa(xmm3, xmm0);
    pand(xmm3, xmm2);
    pclmulqdq(xmm0, xmm3, 0x1);
  }
  psrldq(xmm1, 8);
  psrldq(xmm2, 4);
  pxor(xmm0, xmm1);
  pxor(xmm0, xmm2);

  // 8 8-bit folds to compute 32-bit CRC.
  for (int j = 0; j < 4; j++) {
    fold_8bit_crc32(xmm0, table, xmm1, rax);
  }
  movdl(crc, xmm0); // mov 32 bits to general register
  for (int j = 0; j < 4; j++) {
    fold_8bit_crc32(crc, table, rax);