3965 // AVX 3-operand instructions
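// Each of the wrappers below takes an AddressLiteral operand: when the literal
// is RIP-reachable it is used directly via as_Address(); otherwise its address
// is first materialized into rscratch1 and an indirect [rscratch1 + 0] operand
// is used instead.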
3966
3967 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3968 if (reachable(src)) {
3969 vaddsd(dst, nds, as_Address(src));
3970 } else {
3971 lea(rscratch1, src);
3972 vaddsd(dst, nds, Address(rscratch1, 0));
3973 }
3974 }
3975
3976 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3977 if (reachable(src)) {
3978 vaddss(dst, nds, as_Address(src));
3979 } else {
3980 lea(rscratch1, src);
3981 vaddss(dst, nds, Address(rscratch1, 0));
3982 }
3983 }
3984
3985 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
3986 if (reachable(src)) {
3987 vandpd(dst, nds, as_Address(src), vector256);
3988 } else {
3989 lea(rscratch1, src);
3990 vandpd(dst, nds, Address(rscratch1, 0), vector256);
3991 }
3992 }
3993
3994 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
3995 if (reachable(src)) {
3996 vandps(dst, nds, as_Address(src), vector256);
3997 } else {
3998 lea(rscratch1, src);
3999 vandps(dst, nds, Address(rscratch1, 0), vector256);
4000 }
4001 }
4002
4003 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4004 if (reachable(src)) {
4005 vdivsd(dst, nds, as_Address(src));
4006 } else {
4007 lea(rscratch1, src);
4008 vdivsd(dst, nds, Address(rscratch1, 0));
4009 }
4010 }
4011
4012 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4013 if (reachable(src)) {
4014 vdivss(dst, nds, as_Address(src));
4015 } else {
4016 lea(rscratch1, src);
4017 vdivss(dst, nds, Address(rscratch1, 0));
4018 }
4019 }
4037 }
4038
4039 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4040 if (reachable(src)) {
4041 vsubsd(dst, nds, as_Address(src));
4042 } else {
4043 lea(rscratch1, src);
4044 vsubsd(dst, nds, Address(rscratch1, 0));
4045 }
4046 }
4047
4048 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4049 if (reachable(src)) {
4050 vsubss(dst, nds, as_Address(src));
4051 } else {
4052 lea(rscratch1, src);
4053 vsubss(dst, nds, Address(rscratch1, 0));
4054 }
4055 }
4056
4057 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
4058 if (reachable(src)) {
4059 vxorpd(dst, nds, as_Address(src), vector256);
4060 } else {
4061 lea(rscratch1, src);
4062 vxorpd(dst, nds, Address(rscratch1, 0), vector256);
4063 }
4064 }
4065
4066 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
4067 if (reachable(src)) {
4068 vxorps(dst, nds, as_Address(src), vector256);
4069 } else {
4070 lea(rscratch1, src);
4071 vxorps(dst, nds, Address(rscratch1, 0), vector256);
4072 }
4073 }
4074
4075
4076 //////////////////////////////////////////////////////////////////////////////////
4077 #if INCLUDE_ALL_GCS
4078
4079 void MacroAssembler::g1_write_barrier_pre(Register obj,
4080 Register pre_val,
4081 Register thread,
4082 Register tmp,
4083 bool tosca_live,
4084 bool expand_call) {
4085
4086 // If expand_call is true then we expand the call_VM_leaf macro
4087 // directly, skipping the _last_sp check performed by
4088 // InterpreterMacroAssembler::call_VM_leaf_base.
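// This is the G1 SATB pre-barrier: while concurrent marking is active, the value
// about to be overwritten (pre_val) must be enqueued in the thread's SATB buffer
// so that the snapshot-at-the-beginning invariant is preserved.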
4089
4090 #ifdef _LP64
4091 assert(thread == r15_thread, "must be");
4530 adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0);
4531 #endif
4532 }
4533
4534 void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
4535 pusha();
4536
4537 // if we are coming from c1, xmm registers may be live
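// Depending on the SSE level, different amounts of FP state are spilled below:
// with UseSSE == 1 only the low 32 bits of xmm0-xmm7 are live (movflt spills),
// while with UseSSE >= 2 the full 128-bit registers are saved and, when
// MaxVectorSize > 16, the upper YMM halves are spilled separately.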
4538 int off = 0;
4539 if (UseSSE == 1) {
4540 subptr(rsp, sizeof(jdouble)*8);
4541 movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
4542 movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
4543 movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
4544 movflt(Address(rsp,off++*sizeof(jdouble)),xmm3);
4545 movflt(Address(rsp,off++*sizeof(jdouble)),xmm4);
4546 movflt(Address(rsp,off++*sizeof(jdouble)),xmm5);
4547 movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
4548 movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
4549 } else if (UseSSE >= 2) {
4550 #ifdef COMPILER2
4551 if (MaxVectorSize > 16) {
4552 assert(UseAVX > 0, "256bit vectors are supported only with AVX");
4553 // Save the upper halves of the YMM registers (16 bytes each; 16 registers on LP64, 8 otherwise)
4554 subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
4555 vextractf128h(Address(rsp, 0),xmm0);
4556 vextractf128h(Address(rsp, 16),xmm1);
4557 vextractf128h(Address(rsp, 32),xmm2);
4558 vextractf128h(Address(rsp, 48),xmm3);
4559 vextractf128h(Address(rsp, 64),xmm4);
4560 vextractf128h(Address(rsp, 80),xmm5);
4561 vextractf128h(Address(rsp, 96),xmm6);
4562 vextractf128h(Address(rsp,112),xmm7);
4563 #ifdef _LP64
4564 vextractf128h(Address(rsp,128),xmm8);
4565 vextractf128h(Address(rsp,144),xmm9);
4566 vextractf128h(Address(rsp,160),xmm10);
4567 vextractf128h(Address(rsp,176),xmm11);
4568 vextractf128h(Address(rsp,192),xmm12);
4569 vextractf128h(Address(rsp,208),xmm13);
7022 movl(Address(to, 4), value);
7023 addptr(to, 8);
7024 BIND(L_fill_8_bytes);
7025 subl(count, 1 << (shift + 1));
7026 jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
7027 // fall through to fill 4 bytes
7028 } else {
7029 Label L_fill_32_bytes;
7030 if (!UseUnalignedLoadStores) {
7031 // align to 8 bytes; we know we are 4-byte aligned to start
7032 testptr(to, 4);
7033 jccb(Assembler::zero, L_fill_32_bytes);
7034 movl(Address(to, 0), value);
7035 addptr(to, 4);
7036 subl(count, 1<<shift);
7037 }
7038 BIND(L_fill_32_bytes);
7039 {
7040 assert( UseSSE >= 2, "supported cpu only" );
7041 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
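// Note: count is in elements, with shift chosen so that (1 << shift) elements
// equal 4 bytes; the chunked loops below therefore subtract (16 << shift),
// (8 << shift) and (1 << (shift + 1)) for 64-, 32- and 8-byte steps respectively.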
7042 movdl(xtmp, value);
7043 if (UseAVX >= 2 && UseUnalignedLoadStores) {
7044 // Fill 64-byte chunks
7045 Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
7046 vpbroadcastd(xtmp, xtmp);
7047
7048 subl(count, 16 << shift);
7049 jcc(Assembler::less, L_check_fill_32_bytes);
7050 align(16);
7051
7052 BIND(L_fill_64_bytes_loop);
7053 vmovdqu(Address(to, 0), xtmp);
7054 vmovdqu(Address(to, 32), xtmp);
7055 addptr(to, 64);
7056 subl(count, 16 << shift);
7057 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7058
7059 BIND(L_check_fill_32_bytes);
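// count was biased down by (16 << shift) on entry to the 64-byte loop; adding back
// (8 << shift) leaves it non-negative exactly when at least 32 bytes remain to fill.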
7060 addl(count, 8 << shift);
7061 jccb(Assembler::less, L_check_fill_8_bytes);
7062 vmovdqu(Address(to, 0), xtmp);
7063 addptr(to, 32);
7158
7159 // Setup pointers
7160 lea(src, Address(src, len, Address::times_2)); // char[]
7161 lea(dst, Address(dst, len, Address::times_1)); // byte[]
7162 negptr(len);
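// src and dst now point just past the end of their arrays and len is negative,
// so Address(src, len, times_2) style operands walk forward through the data
// as len is incremented toward zero.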
7163
7164 if (UseSSE42Intrinsics || UseAVX >= 2) {
7165 Label L_chars_8_check, L_copy_8_chars, L_copy_8_chars_exit;
7166 Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
7167
7168 if (UseAVX >= 2) {
7169 Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
7170 movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vector
7171 movdl(tmp1Reg, tmp5);
7172 vpbroadcastd(tmp1Reg, tmp1Reg);
7173 jmpb(L_chars_32_check);
7174
7175 bind(L_copy_32_chars);
7176 vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
7177 vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
7178 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector256 */ true);
7179 vptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector
7180 jccb(Assembler::notZero, L_copy_32_chars_exit);
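// vptest sets ZF only if (chars & 0xff00ff00 mask) == 0, i.e. every char fits in a
// byte. vpackuswb narrows within each 128-bit lane, so vpermq with 0xD8 reorders
// the quadwords to 0,2,1,3 and restores the linear byte order before the store.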
7181 vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector256 */ true);
7182 vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector256 */ true);
7183 vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
7184
7185 bind(L_chars_32_check);
7186 addptr(len, 32);
7187 jccb(Assembler::lessEqual, L_copy_32_chars);
7188
7189 bind(L_copy_32_chars_exit);
7190 subptr(len, 16);
7191 jccb(Assembler::greater, L_copy_16_chars_exit);
7192
7193 } else if (UseSSE42Intrinsics) {
7194 movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vector
7195 movdl(tmp1Reg, tmp5);
7196 pshufd(tmp1Reg, tmp1Reg, 0);
7197 jmpb(L_chars_16_check);
7198 }
7199
7200 bind(L_copy_16_chars);
7201 if (UseAVX >= 2) {
7202 vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
7203 vptest(tmp2Reg, tmp1Reg);
7204 jccb(Assembler::notZero, L_copy_16_chars_exit);
7205 vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector256 */ true);
7206 vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector256 */ true);
7207 } else {
7208 if (UseAVX > 0) {
7209 movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7210 movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7211 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector256 */ false);
7212 } else {
7213 movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7214 por(tmp2Reg, tmp3Reg);
7215 movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7216 por(tmp2Reg, tmp4Reg);
7217 }
7218 ptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector
7219 jccb(Assembler::notZero, L_copy_16_chars_exit);
7220 packuswb(tmp3Reg, tmp4Reg);
7221 }
7222 movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
7223
7224 bind(L_chars_16_check);
7225 addptr(len, 16);
7226 jccb(Assembler::lessEqual, L_copy_16_chars);
7227
7228 bind(L_copy_16_chars_exit);
7229 if (UseAVX >= 2) {
7230 // clean upper bits of YMM registers
7231 vzeroupper();
7730 *
7731 * uint32_t crc;
7732 * val = crc_table[(val ^ crc) & 0xFF];
7733 * crc = val ^ (crc >> 8);
7734 *
7735 */
7736 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
7737 xorl(val, crc);
7738 andl(val, 0xFF);
7739 shrl(crc, 8); // unsigned shift
7740 xorl(crc, Address(table, val, Address::times_4, 0));
7741 }
7742
7743 /**
7744 * Fold 128-bit data chunk
7745 */
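// The fold computes xcrc = (xcrc_hi clmul K_hi) xor (xcrc_lo clmul K_lo) xor data,
// where clmul is a carry-less (GF(2)) multiply and xK holds the folding constants;
// this advances the 128-bit running remainder over the next 16 input bytes.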
7746 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
7747 if (UseAVX > 0) {
7748 vpclmulhdq(xtmp, xK, xcrc); // [123:64]
7749 vpclmulldq(xcrc, xK, xcrc); // [63:0]
7750 vpxor(xcrc, xcrc, Address(buf, offset), false /* vector256 */);
7751 pxor(xcrc, xtmp);
7752 } else {
7753 movdqa(xtmp, xcrc);
7754 pclmulhdq(xtmp, xK); // [123:64]
7755 pclmulldq(xcrc, xK); // [63:0]
7756 pxor(xcrc, xtmp);
7757 movdqu(xtmp, Address(buf, offset));
7758 pxor(xcrc, xtmp);
7759 }
7760 }
7761
7762 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
7763 if (UseAVX > 0) {
7764 vpclmulhdq(xtmp, xK, xcrc);
7765 vpclmulldq(xcrc, xK, xcrc);
7766 pxor(xcrc, xbuf);
7767 pxor(xcrc, xtmp);
7768 } else {
7769 movdqa(xtmp, xcrc);
7770 pclmulhdq(xtmp, xK);
7874 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
7875 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
7876
7877 // Fold the remaining 128-bit data chunks
7878 BIND(L_fold_tail);
7879 addl(len, 3);
7880 jccb(Assembler::lessEqual, L_fold_128b);
7881 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
7882
7883 BIND(L_fold_tail_loop);
7884 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0);
7885 addptr(buf, 16);
7886 decrementl(len);
7887 jccb(Assembler::greater, L_fold_tail_loop);
7888
7889 // Fold 128 bits in xmm1 down into 32 bits in crc register.
7890 BIND(L_fold_128b);
7891 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
7892 if (UseAVX > 0) {
7893 vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
7894 vpand(xmm3, xmm0, xmm2, false /* vector256 */);
7895 vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
7896 } else {
7897 movdqa(xmm2, xmm0);
7898 pclmulqdq(xmm2, xmm1, 0x1);
7899 movdqa(xmm3, xmm0);
7900 pand(xmm3, xmm2);
7901 pclmulqdq(xmm0, xmm3, 0x1);
7902 }
7903 psrldq(xmm1, 8);
7904 psrldq(xmm2, 4);
7905 pxor(xmm0, xmm1);
7906 pxor(xmm0, xmm2);
7907
7908 // 8 8-bit folds to compute 32-bit CRC.
7909 for (int j = 0; j < 4; j++) {
7910 fold_8bit_crc32(xmm0, table, xmm1, rax);
7911 }
7912 movdl(crc, xmm0); // mov 32 bits to general register
7913 for (int j = 0; j < 4; j++) {
7914 fold_8bit_crc32(crc, table, rax);
3965 // AVX 3-operand instructions
3966
3967 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3968 if (reachable(src)) {
3969 vaddsd(dst, nds, as_Address(src));
3970 } else {
3971 lea(rscratch1, src);
3972 vaddsd(dst, nds, Address(rscratch1, 0));
3973 }
3974 }
3975
3976 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3977 if (reachable(src)) {
3978 vaddss(dst, nds, as_Address(src));
3979 } else {
3980 lea(rscratch1, src);
3981 vaddss(dst, nds, Address(rscratch1, 0));
3982 }
3983 }
3984
3985 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
3986 if (reachable(src)) {
3987 vandpd(dst, nds, as_Address(src), vector_len);
3988 } else {
3989 lea(rscratch1, src);
3990 vandpd(dst, nds, Address(rscratch1, 0), vector_len);
3991 }
3992 }
3993
3994 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
3995 if (reachable(src)) {
3996 vandps(dst, nds, as_Address(src), vector_len);
3997 } else {
3998 lea(rscratch1, src);
3999 vandps(dst, nds, Address(rscratch1, 0), vector_len);
4000 }
4001 }
4002
4003 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4004 if (reachable(src)) {
4005 vdivsd(dst, nds, as_Address(src));
4006 } else {
4007 lea(rscratch1, src);
4008 vdivsd(dst, nds, Address(rscratch1, 0));
4009 }
4010 }
4011
4012 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4013 if (reachable(src)) {
4014 vdivss(dst, nds, as_Address(src));
4015 } else {
4016 lea(rscratch1, src);
4017 vdivss(dst, nds, Address(rscratch1, 0));
4018 }
4019 }
4037 }
4038
4039 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4040 if (reachable(src)) {
4041 vsubsd(dst, nds, as_Address(src));
4042 } else {
4043 lea(rscratch1, src);
4044 vsubsd(dst, nds, Address(rscratch1, 0));
4045 }
4046 }
4047
4048 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4049 if (reachable(src)) {
4050 vsubss(dst, nds, as_Address(src));
4051 } else {
4052 lea(rscratch1, src);
4053 vsubss(dst, nds, Address(rscratch1, 0));
4054 }
4055 }
4056
4057 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4058 if (reachable(src)) {
4059 vxorpd(dst, nds, as_Address(src), vector_len);
4060 } else {
4061 lea(rscratch1, src);
4062 vxorpd(dst, nds, Address(rscratch1, 0), vector_len);
4063 }
4064 }
4065
4066 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4067 if (reachable(src)) {
4068 vxorps(dst, nds, as_Address(src), vector_len);
4069 } else {
4070 lea(rscratch1, src);
4071 vxorps(dst, nds, Address(rscratch1, 0), vector_len);
4072 }
4073 }
4074
4075
4076 //////////////////////////////////////////////////////////////////////////////////
4077 #if INCLUDE_ALL_GCS
4078
4079 void MacroAssembler::g1_write_barrier_pre(Register obj,
4080 Register pre_val,
4081 Register thread,
4082 Register tmp,
4083 bool tosca_live,
4084 bool expand_call) {
4085
4086 // If expand_call is true then we expand the call_VM_leaf macro
4087 // directly, skipping the _last_sp check performed by
4088 // InterpreterMacroAssembler::call_VM_leaf_base.
4089
4090 #ifdef _LP64
4091 assert(thread == r15_thread, "must be");
4530 adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0);
4531 #endif
4532 }
4533
4534 void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
4535 pusha();
4536
4537 // if we are coming from c1, xmm registers may be live
4538 int off = 0;
4539 if (UseSSE == 1) {
4540 subptr(rsp, sizeof(jdouble)*8);
4541 movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
4542 movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
4543 movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
4544 movflt(Address(rsp,off++*sizeof(jdouble)),xmm3);
4545 movflt(Address(rsp,off++*sizeof(jdouble)),xmm4);
4546 movflt(Address(rsp,off++*sizeof(jdouble)),xmm5);
4547 movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
4548 movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
4549 } else if (UseSSE >= 2) {
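// On AVX-512 (UseAVX > 2) the EVEX-encoded instructions emitted below use opmask
// register k1, so it is loaded with 0xffff to enable all 16 dword lanes
// (effectively unmasked operation).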
4550 if (UseAVX > 2) {
4551 movl(rbx, 0xffff);
4552 #ifdef _LP64
4553 kmovql(k1, rbx);
4554 #else
4555 kmovdl(k1, rbx);
4556 #endif
4557 }
4558 #ifdef COMPILER2
4559 if (MaxVectorSize > 16) {
4560 assert(UseAVX > 0, "256bit vectors are supported only with AVX");
4561 // Save the upper halves of the YMM registers (16 bytes each; 16 registers on LP64, 8 otherwise)
4562 subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
4563 vextractf128h(Address(rsp, 0),xmm0);
4564 vextractf128h(Address(rsp, 16),xmm1);
4565 vextractf128h(Address(rsp, 32),xmm2);
4566 vextractf128h(Address(rsp, 48),xmm3);
4567 vextractf128h(Address(rsp, 64),xmm4);
4568 vextractf128h(Address(rsp, 80),xmm5);
4569 vextractf128h(Address(rsp, 96),xmm6);
4570 vextractf128h(Address(rsp,112),xmm7);
4571 #ifdef _LP64
4572 vextractf128h(Address(rsp,128),xmm8);
4573 vextractf128h(Address(rsp,144),xmm9);
4574 vextractf128h(Address(rsp,160),xmm10);
4575 vextractf128h(Address(rsp,176),xmm11);
4576 vextractf128h(Address(rsp,192),xmm12);
4577 vextractf128h(Address(rsp,208),xmm13);
7030 movl(Address(to, 4), value);
7031 addptr(to, 8);
7032 BIND(L_fill_8_bytes);
7033 subl(count, 1 << (shift + 1));
7034 jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
7035 // fall through to fill 4 bytes
7036 } else {
7037 Label L_fill_32_bytes;
7038 if (!UseUnalignedLoadStores) {
7039 // align to 8 bytes; we know we are 4-byte aligned to start
7040 testptr(to, 4);
7041 jccb(Assembler::zero, L_fill_32_bytes);
7042 movl(Address(to, 0), value);
7043 addptr(to, 4);
7044 subl(count, 1<<shift);
7045 }
7046 BIND(L_fill_32_bytes);
7047 {
7048 assert( UseSSE >= 2, "supported cpu only" );
7049 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
7050 if (UseAVX > 2) {
7051 movl(rtmp, 0xffff);
7052 #ifdef _LP64
7053 kmovql(k1, rtmp);
7054 #else
7055 kmovdl(k1, rtmp);
7056 #endif
7057 }
7058 movdl(xtmp, value);
7059 if (UseAVX > 2 && UseUnalignedLoadStores) {
7060 // Fill 64-byte chunks
7061 Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
7062 evpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
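// xtmp now holds the fill value replicated into all 16 dwords of a ZMM register;
// each iteration of the loop below stores one 64-byte chunk with a single
// 512-bit evmovdqu.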
7063
7064 subl(count, 16 << shift);
7065 jcc(Assembler::less, L_check_fill_32_bytes);
7066 align(16);
7067
7068 BIND(L_fill_64_bytes_loop);
7069 evmovdqu(Address(to, 0), xtmp, Assembler::AVX_512bit);
7070 addptr(to, 64);
7071 subl(count, 16 << shift);
7072 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7073
7074 BIND(L_check_fill_32_bytes);
7075 addl(count, 8 << shift);
7076 jccb(Assembler::less, L_check_fill_8_bytes);
7077 evmovdqu(Address(to, 0), xtmp, Assembler::AVX_256bit);
7078 addptr(to, 32);
7079 subl(count, 8 << shift);
7080
7081 BIND(L_check_fill_8_bytes);
7082 } else if (UseAVX == 2 && UseUnalignedLoadStores) {
7083 // Fill 64-byte chunks
7084 Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
7085 vpbroadcastd(xtmp, xtmp);
7086
7087 subl(count, 16 << shift);
7088 jcc(Assembler::less, L_check_fill_32_bytes);
7089 align(16);
7090
7091 BIND(L_fill_64_bytes_loop);
7092 vmovdqu(Address(to, 0), xtmp);
7093 vmovdqu(Address(to, 32), xtmp);
7094 addptr(to, 64);
7095 subl(count, 16 << shift);
7096 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7097
7098 BIND(L_check_fill_32_bytes);
7099 addl(count, 8 << shift);
7100 jccb(Assembler::less, L_check_fill_8_bytes);
7101 vmovdqu(Address(to, 0), xtmp);
7102 addptr(to, 32);
7197
7198 // Setup pointers
7199 lea(src, Address(src, len, Address::times_2)); // char[]
7200 lea(dst, Address(dst, len, Address::times_1)); // byte[]
7201 negptr(len);
7202
7203 if (UseSSE42Intrinsics || UseAVX >= 2) {
7204 Label L_chars_8_check, L_copy_8_chars, L_copy_8_chars_exit;
7205 Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
7206
7207 if (UseAVX >= 2) {
7208 Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
7209 movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vector
7210 movdl(tmp1Reg, tmp5);
7211 vpbroadcastd(tmp1Reg, tmp1Reg);
7212 jmpb(L_chars_32_check);
7213
7214 bind(L_copy_32_chars);
7215 vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
7216 vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
7217 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
7218 vptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector
7219 jccb(Assembler::notZero, L_copy_32_chars_exit);
7220 vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
7221 vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
7222 vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
7223
7224 bind(L_chars_32_check);
7225 addptr(len, 32);
7226 jccb(Assembler::lessEqual, L_copy_32_chars);
7227
7228 bind(L_copy_32_chars_exit);
7229 subptr(len, 16);
7230 jccb(Assembler::greater, L_copy_16_chars_exit);
7231
7232 } else if (UseSSE42Intrinsics) {
7233 movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vector
7234 movdl(tmp1Reg, tmp5);
7235 pshufd(tmp1Reg, tmp1Reg, 0);
7236 jmpb(L_chars_16_check);
7237 }
7238
7239 bind(L_copy_16_chars);
7240 if (UseAVX >= 2) {
7241 vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
7242 vptest(tmp2Reg, tmp1Reg);
7243 jccb(Assembler::notZero, L_copy_16_chars_exit);
7244 vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
7245 vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
7246 } else {
7247 if (UseAVX > 0) {
7248 movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7249 movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7250 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
7251 } else {
7252 movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7253 por(tmp2Reg, tmp3Reg);
7254 movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7255 por(tmp2Reg, tmp4Reg);
7256 }
7257 ptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector
7258 jccb(Assembler::notZero, L_copy_16_chars_exit);
7259 packuswb(tmp3Reg, tmp4Reg);
7260 }
7261 movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
7262
7263 bind(L_chars_16_check);
7264 addptr(len, 16);
7265 jccb(Assembler::lessEqual, L_copy_16_chars);
7266
7267 bind(L_copy_16_chars_exit);
7268 if (UseAVX >= 2) {
7269 // clean upper bits of YMM registers
7270 vzeroupper();
7769 *
7770 * uint32_t crc;
7771 * val = crc_table[(val ^ crc) & 0xFF];
7772 * crc = val ^ (crc >> 8);
7773 *
7774 */
7775 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
7776 xorl(val, crc);
7777 andl(val, 0xFF);
7778 shrl(crc, 8); // unsigned shift
7779 xorl(crc, Address(table, val, Address::times_4, 0));
7780 }
7781
7782 /**
7783 * Fold 128-bit data chunk
7784 */
7785 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
7786 if (UseAVX > 0) {
7787 vpclmulhdq(xtmp, xK, xcrc); // [123:64]
7788 vpclmulldq(xcrc, xK, xcrc); // [63:0]
7789 vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
7790 pxor(xcrc, xtmp);
7791 } else {
7792 movdqa(xtmp, xcrc);
7793 pclmulhdq(xtmp, xK); // [123:64]
7794 pclmulldq(xcrc, xK); // [63:0]
7795 pxor(xcrc, xtmp);
7796 movdqu(xtmp, Address(buf, offset));
7797 pxor(xcrc, xtmp);
7798 }
7799 }
7800
7801 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
7802 if (UseAVX > 0) {
7803 vpclmulhdq(xtmp, xK, xcrc);
7804 vpclmulldq(xcrc, xK, xcrc);
7805 pxor(xcrc, xbuf);
7806 pxor(xcrc, xtmp);
7807 } else {
7808 movdqa(xtmp, xcrc);
7809 pclmulhdq(xtmp, xK);
7913 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
7914 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
7915
7916 // Fold the remaining 128-bit data chunks
7917 BIND(L_fold_tail);
7918 addl(len, 3);
7919 jccb(Assembler::lessEqual, L_fold_128b);
7920 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
7921
7922 BIND(L_fold_tail_loop);
7923 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0);
7924 addptr(buf, 16);
7925 decrementl(len);
7926 jccb(Assembler::greater, L_fold_tail_loop);
7927
7928 // Fold 128 bits in xmm1 down into 32 bits in crc register.
7929 BIND(L_fold_128b);
7930 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
7931 if (UseAVX > 0) {
7932 vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
7933 vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
7934 vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
7935 } else {
7936 movdqa(xmm2, xmm0);
7937 pclmulqdq(xmm2, xmm1, 0x1);
7938 movdqa(xmm3, xmm0);
7939 pand(xmm3, xmm2);
7940 pclmulqdq(xmm0, xmm3, 0x1);
7941 }
7942 psrldq(xmm1, 8);
7943 psrldq(xmm2, 4);
7944 pxor(xmm0, xmm1);
7945 pxor(xmm0, xmm2);
7946
7947 // 8 8-bit folds to compute 32-bit CRC.
7948 for (int j = 0; j < 4; j++) {
7949 fold_8bit_crc32(xmm0, table, xmm1, rax);
7950 }
7951 movdl(crc, xmm0); // mov 32 bits to general register
7952 for (int j = 0; j < 4; j++) {
7953 fold_8bit_crc32(crc, table, rax);