src/cpu/x86/vm/macroAssembler_x86.cpp

rev 8344 : 8076276: Add support for AVX512
Reviewed-by: kvn, roland
Contributed-by: michael.c.berg@intel.com

Old:

3979 // AVX 3-operands instructions
3980 
3981 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3982   if (reachable(src)) {
3983     vaddsd(dst, nds, as_Address(src));
3984   } else {
3985     lea(rscratch1, src);
3986     vaddsd(dst, nds, Address(rscratch1, 0));
3987   }
3988 }
3989 
3990 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3991   if (reachable(src)) {
3992     vaddss(dst, nds, as_Address(src));
3993   } else {
3994     lea(rscratch1, src);
3995     vaddss(dst, nds, Address(rscratch1, 0));
3996   }
3997 }
3998 
3999 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
4000   if (reachable(src)) {
4001     vandpd(dst, nds, as_Address(src), vector256);
4002   } else {
4003     lea(rscratch1, src);
4004     vandpd(dst, nds, Address(rscratch1, 0), vector256);
4005   }
4006 }
4007 
4008 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
4009   if (reachable(src)) {
4010     vandps(dst, nds, as_Address(src), vector256);
4011   } else {
4012     lea(rscratch1, src);
4013     vandps(dst, nds, Address(rscratch1, 0), vector256);
4014   }
4015 }
4016 
4017 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4018   if (reachable(src)) {
4019     vdivsd(dst, nds, as_Address(src));
4020   } else {
4021     lea(rscratch1, src);
4022     vdivsd(dst, nds, Address(rscratch1, 0));
4023   }
4024 }
4025 
4026 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4027   if (reachable(src)) {
4028     vdivss(dst, nds, as_Address(src));
4029   } else {
4030     lea(rscratch1, src);
4031     vdivss(dst, nds, Address(rscratch1, 0));
4032   }
4033 }


4051 }
4052 
4053 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4054   if (reachable(src)) {
4055     vsubsd(dst, nds, as_Address(src));
4056   } else {
4057     lea(rscratch1, src);
4058     vsubsd(dst, nds, Address(rscratch1, 0));
4059   }
4060 }
4061 
4062 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4063   if (reachable(src)) {
4064     vsubss(dst, nds, as_Address(src));
4065   } else {
4066     lea(rscratch1, src);
4067     vsubss(dst, nds, Address(rscratch1, 0));
4068   }
4069 }
4070 
4071 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
4072   if (reachable(src)) {
4073     vxorpd(dst, nds, as_Address(src), vector256);
4074   } else {
4075     lea(rscratch1, src);
4076     vxorpd(dst, nds, Address(rscratch1, 0), vector256);
4077   }
4078 }
4079 
4080 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
4081   if (reachable(src)) {
4082     vxorps(dst, nds, as_Address(src), vector256);
4083   } else {
4084     lea(rscratch1, src);
4085     vxorps(dst, nds, Address(rscratch1, 0), vector256);
4086   }
4087 }
4088 
4089 
4090 //////////////////////////////////////////////////////////////////////////////////
4091 #if INCLUDE_ALL_GCS
4092 
4093 void MacroAssembler::g1_write_barrier_pre(Register obj,
4094                                           Register pre_val,
4095                                           Register thread,
4096                                           Register tmp,
4097                                           bool tosca_live,
4098                                           bool expand_call) {
4099 
4100   // If expand_call is true then we expand the call_VM_leaf macro
4101   // directly to skip generating the check by
4102   // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.
4103 
4104 #ifdef _LP64
4105   assert(thread == r15_thread, "must be");


4544   adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0);
4545 #endif
4546 }
4547 
4548 void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
4549   pusha();
4550 
4551   // if we are coming from c1, xmm registers may be live
4552   int off = 0;
4553   if (UseSSE == 1)  {
4554     subptr(rsp, sizeof(jdouble)*8);
4555     movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
4556     movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
4557     movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
4558     movflt(Address(rsp,off++*sizeof(jdouble)),xmm3);
4559     movflt(Address(rsp,off++*sizeof(jdouble)),xmm4);
4560     movflt(Address(rsp,off++*sizeof(jdouble)),xmm5);
4561     movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
4562     movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
4563   } else if (UseSSE >= 2)  {
4564 #ifdef COMPILER2
4565     if (MaxVectorSize > 16) {
4566       assert(UseAVX > 0, "256bit vectors are supported only with AVX");
4567       // Save upper half of YMM registers
4568       subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
4569       vextractf128h(Address(rsp,  0),xmm0);
4570       vextractf128h(Address(rsp, 16),xmm1);
4571       vextractf128h(Address(rsp, 32),xmm2);
4572       vextractf128h(Address(rsp, 48),xmm3);
4573       vextractf128h(Address(rsp, 64),xmm4);
4574       vextractf128h(Address(rsp, 80),xmm5);
4575       vextractf128h(Address(rsp, 96),xmm6);
4576       vextractf128h(Address(rsp,112),xmm7);
4577 #ifdef _LP64
4578       vextractf128h(Address(rsp,128),xmm8);
4579       vextractf128h(Address(rsp,144),xmm9);
4580       vextractf128h(Address(rsp,160),xmm10);
4581       vextractf128h(Address(rsp,176),xmm11);
4582       vextractf128h(Address(rsp,192),xmm12);
4583       vextractf128h(Address(rsp,208),xmm13);


7046     movl(Address(to, 4), value);
7047     addptr(to, 8);
7048     BIND(L_fill_8_bytes);
7049     subl(count, 1 << (shift + 1));
7050     jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
7051     // fall through to fill 4 bytes
7052   } else {
7053     Label L_fill_32_bytes;
7054     if (!UseUnalignedLoadStores) {
7055       // align to 8 bytes, we know we are 4 byte aligned to start
7056       testptr(to, 4);
7057       jccb(Assembler::zero, L_fill_32_bytes);
7058       movl(Address(to, 0), value);
7059       addptr(to, 4);
7060       subl(count, 1<<shift);
7061     }
7062     BIND(L_fill_32_bytes);
7063     {
7064       assert( UseSSE >= 2, "supported cpu only" );
7065       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
7066       movdl(xtmp, value);
7067       if (UseAVX >= 2 && UseUnalignedLoadStores) {
7068         // Fill 64-byte chunks
7069         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
7070         vpbroadcastd(xtmp, xtmp);
7071 
7072         subl(count, 16 << shift);
7073         jcc(Assembler::less, L_check_fill_32_bytes);
7074         align(16);
7075 
7076         BIND(L_fill_64_bytes_loop);
7077         vmovdqu(Address(to, 0), xtmp);
7078         vmovdqu(Address(to, 32), xtmp);
7079         addptr(to, 64);
7080         subl(count, 16 << shift);
7081         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7082 
7083         BIND(L_check_fill_32_bytes);
7084         addl(count, 8 << shift);
7085         jccb(Assembler::less, L_check_fill_8_bytes);
7086         vmovdqu(Address(to, 0), xtmp);
7087         addptr(to, 32);


7183 
7184   // Setup pointers
7185   lea(src, Address(src, len, Address::times_2)); // char[]
7186   lea(dst, Address(dst, len, Address::times_1)); // byte[]
7187   negptr(len);
7188 
7189   if (UseSSE42Intrinsics || UseAVX >= 2) {
7190     Label L_chars_8_check, L_copy_8_chars, L_copy_8_chars_exit;
7191     Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
7192 
7193     if (UseAVX >= 2) {
7194       Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
7195       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
7196       movdl(tmp1Reg, tmp5);
7197       vpbroadcastd(tmp1Reg, tmp1Reg);
7198       jmpb(L_chars_32_check);
7199 
7200       bind(L_copy_32_chars);
7201       vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
7202       vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
7203       vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector256 */ true);
7204       vptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
7205       jccb(Assembler::notZero, L_copy_32_chars_exit);
7206       vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector256 */ true);
7207       vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector256 */ true);
7208       vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
7209 
7210       bind(L_chars_32_check);
7211       addptr(len, 32);
7212       jccb(Assembler::lessEqual, L_copy_32_chars);
7213 
7214       bind(L_copy_32_chars_exit);
7215       subptr(len, 16);
7216       jccb(Assembler::greater, L_copy_16_chars_exit);
7217 
7218     } else if (UseSSE42Intrinsics) {
7219       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
7220       movdl(tmp1Reg, tmp5);
7221       pshufd(tmp1Reg, tmp1Reg, 0);
7222       jmpb(L_chars_16_check);
7223     }
7224 
7225     bind(L_copy_16_chars);
7226     if (UseAVX >= 2) {
7227       vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
7228       vptest(tmp2Reg, tmp1Reg);
7229       jccb(Assembler::notZero, L_copy_16_chars_exit);
7230       vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector256 */ true);
7231       vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector256 */ true);
7232     } else {
7233       if (UseAVX > 0) {
7234         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7235         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7236         vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector256 */ false);
7237       } else {
7238         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7239         por(tmp2Reg, tmp3Reg);
7240         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7241         por(tmp2Reg, tmp4Reg);
7242       }
7243       ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
7244       jccb(Assembler::notZero, L_copy_16_chars_exit);
7245       packuswb(tmp3Reg, tmp4Reg);
7246     }
7247     movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
7248 
7249     bind(L_chars_16_check);
7250     addptr(len, 16);
7251     jccb(Assembler::lessEqual, L_copy_16_chars);
7252 
7253     bind(L_copy_16_chars_exit);
7254     if (UseAVX >= 2) {
7255       // clean upper bits of YMM registers
7256       vpxor(tmp2Reg, tmp2Reg);


7759  *
7760  * uint32_t crc;
7761  * val = crc_table[(val ^ crc) & 0xFF];
7762  * crc = val ^ (crc >> 8);
7763  *
7764  */
7765 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
7766   xorl(val, crc);
7767   andl(val, 0xFF);
7768   shrl(crc, 8); // unsigned shift
7769   xorl(crc, Address(table, val, Address::times_4, 0));
7770 }
7771 
7772 /**
7773  * Fold 128-bit data chunk
7774  */
7775 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
7776   if (UseAVX > 0) {
7777     vpclmulhdq(xtmp, xK, xcrc); // [123:64]
7778     vpclmulldq(xcrc, xK, xcrc); // [63:0]
7779     vpxor(xcrc, xcrc, Address(buf, offset), false /* vector256 */);
7780     pxor(xcrc, xtmp);
7781   } else {
7782     movdqa(xtmp, xcrc);
7783     pclmulhdq(xtmp, xK);   // [123:64]
7784     pclmulldq(xcrc, xK);   // [63:0]
7785     pxor(xcrc, xtmp);
7786     movdqu(xtmp, Address(buf, offset));
7787     pxor(xcrc, xtmp);
7788   }
7789 }
7790 
7791 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
7792   if (UseAVX > 0) {
7793     vpclmulhdq(xtmp, xK, xcrc);
7794     vpclmulldq(xcrc, xK, xcrc);
7795     pxor(xcrc, xbuf);
7796     pxor(xcrc, xtmp);
7797   } else {
7798     movdqa(xtmp, xcrc);
7799     pclmulhdq(xtmp, xK);


7903   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
7904   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
7905 
7906   // Fold the rest of 128 bits data chunks
7907   BIND(L_fold_tail);
7908   addl(len, 3);
7909   jccb(Assembler::lessEqual, L_fold_128b);
7910   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
7911 
7912   BIND(L_fold_tail_loop);
7913   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
7914   addptr(buf, 16);
7915   decrementl(len);
7916   jccb(Assembler::greater, L_fold_tail_loop);
7917 
7918   // Fold 128 bits in xmm1 down into 32 bits in crc register.
7919   BIND(L_fold_128b);
7920   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
7921   if (UseAVX > 0) {
7922     vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
7923     vpand(xmm3, xmm0, xmm2, false /* vector256 */);
7924     vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
7925   } else {
7926     movdqa(xmm2, xmm0);
7927     pclmulqdq(xmm2, xmm1, 0x1);
7928     movdqa(xmm3, xmm0);
7929     pand(xmm3, xmm2);
7930     pclmulqdq(xmm0, xmm3, 0x1);
7931   }
7932   psrldq(xmm1, 8);
7933   psrldq(xmm2, 4);
7934   pxor(xmm0, xmm1);
7935   pxor(xmm0, xmm2);
7936 
7937   // 8 8-bit folds to compute 32-bit CRC.
7938   for (int j = 0; j < 4; j++) {
7939     fold_8bit_crc32(xmm0, table, xmm1, rax);
7940   }
7941   movdl(crc, xmm0); // mov 32 bits to general register
7942   for (int j = 0; j < 4; j++) {
7943     fold_8bit_crc32(crc, table, rax);




3979 // AVX 3-operands instructions
3980 
3981 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3982   if (reachable(src)) {
3983     vaddsd(dst, nds, as_Address(src));
3984   } else {
3985     lea(rscratch1, src);
3986     vaddsd(dst, nds, Address(rscratch1, 0));
3987   }
3988 }
3989 
3990 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3991   if (reachable(src)) {
3992     vaddss(dst, nds, as_Address(src));
3993   } else {
3994     lea(rscratch1, src);
3995     vaddss(dst, nds, Address(rscratch1, 0));
3996   }
3997 }
3998 
3999 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4000   if (reachable(src)) {
4001     vandpd(dst, nds, as_Address(src), vector_len);
4002   } else {
4003     lea(rscratch1, src);
4004     vandpd(dst, nds, Address(rscratch1, 0), vector_len);
4005   }
4006 }
4007 
4008 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4009   if (reachable(src)) {
4010     vandps(dst, nds, as_Address(src), vector_len);
4011   } else {
4012     lea(rscratch1, src);
4013     vandps(dst, nds, Address(rscratch1, 0), vector_len);
4014   }
4015 }
4016 
4017 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4018   if (reachable(src)) {
4019     vdivsd(dst, nds, as_Address(src));
4020   } else {
4021     lea(rscratch1, src);
4022     vdivsd(dst, nds, Address(rscratch1, 0));
4023   }
4024 }
4025 
4026 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4027   if (reachable(src)) {
4028     vdivss(dst, nds, as_Address(src));
4029   } else {
4030     lea(rscratch1, src);
4031     vdivss(dst, nds, Address(rscratch1, 0));
4032   }
4033 }


4051 }
4052 
4053 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4054   if (reachable(src)) {
4055     vsubsd(dst, nds, as_Address(src));
4056   } else {
4057     lea(rscratch1, src);
4058     vsubsd(dst, nds, Address(rscratch1, 0));
4059   }
4060 }
4061 
4062 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4063   if (reachable(src)) {
4064     vsubss(dst, nds, as_Address(src));
4065   } else {
4066     lea(rscratch1, src);
4067     vsubss(dst, nds, Address(rscratch1, 0));
4068   }
4069 }
4070 
4071 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4072   if (reachable(src)) {
4073     vxorpd(dst, nds, as_Address(src), vector_len);
4074   } else {
4075     lea(rscratch1, src);
4076     vxorpd(dst, nds, Address(rscratch1, 0), vector_len);
4077   }
4078 }
4079 
4080 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4081   if (reachable(src)) {
4082     vxorps(dst, nds, as_Address(src), vector_len);
4083   } else {
4084     lea(rscratch1, src);
4085     vxorps(dst, nds, Address(rscratch1, 0), vector_len);
4086   }
4087 }
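
Each of these AddressLiteral overloads follows the same fallback pattern: when the literal is RIP-reachable from the emitted code it is encoded directly as a memory operand, otherwise the 64-bit address is first materialized into rscratch1 and an indirect operand through that scratch register is used. A hedged usage sketch from a caller's point of view (the mask constant and its address are hypothetical, not part of this change):

    // Sketch only: 'float_abs_mask_addr' stands for the address of a 16-byte
    // 0x7fffffff mask emitted elsewhere in the code cache; vector_len 0 selects
    // the 128-bit encoding.
    masm->vandps(xmm0, xmm0, ExternalAddress(float_abs_mask_addr), /* vector_len */ 0); // abs(float)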
4088 
4089 
4090 //////////////////////////////////////////////////////////////////////////////////
4091 #if INCLUDE_ALL_GCS
4092 
4093 void MacroAssembler::g1_write_barrier_pre(Register obj,
4094                                           Register pre_val,
4095                                           Register thread,
4096                                           Register tmp,
4097                                           bool tosca_live,
4098                                           bool expand_call) {
4099 
4100   // If expand_call is true then we expand the call_VM_leaf macro
4101   // directly to skip generating the check by
4102   // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.
4103 
4104 #ifdef _LP64
4105   assert(thread == r15_thread, "must be");


4544   adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0);
4545 #endif
4546 }
4547 
4548 void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
4549   pusha();
4550 
4551   // if we are coming from c1, xmm registers may be live
4552   int off = 0;
4553   if (UseSSE == 1)  {
4554     subptr(rsp, sizeof(jdouble)*8);
4555     movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
4556     movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
4557     movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
4558     movflt(Address(rsp,off++*sizeof(jdouble)),xmm3);
4559     movflt(Address(rsp,off++*sizeof(jdouble)),xmm4);
4560     movflt(Address(rsp,off++*sizeof(jdouble)),xmm5);
4561     movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
4562     movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
4563   } else if (UseSSE >= 2)  {
4564     if (UseAVX > 2) {
4565       movl(rbx, 0xffff);
4566 #ifdef _LP64
4567       kmovql(k1, rbx);
4568 #else
4569       kmovdl(k1, rbx);
4570 #endif
4571     }
4572 #ifdef COMPILER2
4573     if (MaxVectorSize > 16) {
4574       assert(UseAVX > 0, "256bit vectors are supported only with AVX");
4575       // Save upper half of YMM registers
4576       subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
4577       vextractf128h(Address(rsp,  0),xmm0);
4578       vextractf128h(Address(rsp, 16),xmm1);
4579       vextractf128h(Address(rsp, 32),xmm2);
4580       vextractf128h(Address(rsp, 48),xmm3);
4581       vextractf128h(Address(rsp, 64),xmm4);
4582       vextractf128h(Address(rsp, 80),xmm5);
4583       vextractf128h(Address(rsp, 96),xmm6);
4584       vextractf128h(Address(rsp,112),xmm7);
4585 #ifdef _LP64
4586       vextractf128h(Address(rsp,128),xmm8);
4587       vextractf128h(Address(rsp,144),xmm9);
4588       vextractf128h(Address(rsp,160),xmm10);
4589       vextractf128h(Address(rsp,176),xmm11);
4590       vextractf128h(Address(rsp,192),xmm12);
4591       vextractf128h(Address(rsp,208),xmm13);


7054     movl(Address(to, 4), value);
7055     addptr(to, 8);
7056     BIND(L_fill_8_bytes);
7057     subl(count, 1 << (shift + 1));
7058     jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
7059     // fall through to fill 4 bytes
7060   } else {
7061     Label L_fill_32_bytes;
7062     if (!UseUnalignedLoadStores) {
7063       // align to 8 bytes, we know we are 4 byte aligned to start
7064       testptr(to, 4);
7065       jccb(Assembler::zero, L_fill_32_bytes);
7066       movl(Address(to, 0), value);
7067       addptr(to, 4);
7068       subl(count, 1<<shift);
7069     }
7070     BIND(L_fill_32_bytes);
7071     {
7072       assert( UseSSE >= 2, "supported cpu only" );
7073       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
7074       if (UseAVX > 2) {
7075         movl(rtmp, 0xffff);
7076 #ifdef _LP64
7077         kmovql(k1, rtmp);
7078 #else
7079         kmovdl(k1, rtmp);
7080 #endif
7081       }
7082       movdl(xtmp, value);
7083       if (UseAVX > 2 && UseUnalignedLoadStores) {
7084         // Fill 64-byte chunks
7085         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
7086         evpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
7087 
7088         subl(count, 16 << shift);
7089         jcc(Assembler::less, L_check_fill_32_bytes);
7090         align(16);
7091 
7092         BIND(L_fill_64_bytes_loop);
7093         evmovdqu(Address(to, 0), xtmp, Assembler::AVX_512bit);
7094         addptr(to, 64);
7095         subl(count, 16 << shift);
7096         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7097 
7098         BIND(L_check_fill_32_bytes);
7099         addl(count, 8 << shift);
7100         jccb(Assembler::less, L_check_fill_8_bytes);
7101         evmovdqu(Address(to, 0), xtmp, Assembler::AVX_256bit);
7102         addptr(to, 32);
7103         subl(count, 8 << shift);
7104 
7105         BIND(L_check_fill_8_bytes);
7106       } else if (UseAVX == 2 && UseUnalignedLoadStores) {
7107         // Fill 64-byte chunks
7108         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
7109         vpbroadcastd(xtmp, xtmp);
7110 
7111         subl(count, 16 << shift);
7112         jcc(Assembler::less, L_check_fill_32_bytes);
7113         align(16);
7114 
7115         BIND(L_fill_64_bytes_loop);
7116         vmovdqu(Address(to, 0), xtmp);
7117         vmovdqu(Address(to, 32), xtmp);
7118         addptr(to, 64);
7119         subl(count, 16 << shift);
7120         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7121 
7122         BIND(L_check_fill_32_bytes);
7123         addl(count, 8 << shift);
7124         jccb(Assembler::less, L_check_fill_8_bytes);
7125         vmovdqu(Address(to, 0), xtmp);
7126         addptr(to, 32);
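
In both fill paths above, count is an element count and shift encodes the element size: the earlier 4-byte store that subtracts 1 << shift implies an element is 4 >> shift bytes, so 16 << shift elements always equal 64 bytes and 8 << shift elements equal 32 bytes. The UseAVX > 2 branch broadcasts the dword with evpbroadcastd and writes one 64-byte ZMM store per iteration, while the AVX2 branch needs two 32-byte vmovdqu stores. A hedged C++ sketch of just the chunking bookkeeping (store64/store32 are hypothetical stand-ins for the vector stores):

    #include <cstdint>

    // Sketch of the 64/32-byte chunking above; not HotSpot code.
    void fill_chunks_sketch(uint8_t* to, int count, int shift,
                            void (*store64)(uint8_t*), void (*store32)(uint8_t*)) {
      count -= 16 << shift;          // bias count by one 64-byte chunk
      while (count >= 0) {           // subl / jcc(greaterEqual) loop
        store64(to); to += 64;       // one evmovdqu (512-bit) or two vmovdqu (256-bit)
        count -= 16 << shift;
      }
      count += 8 << shift;           // re-bias for a single 32-byte chunk
      if (count >= 0) {              // addl / jccb(less) check
        store32(to); to += 32;
        count -= 8 << shift;
      }
      // anything left (under 32 bytes) falls through to the 8/4/2/1-byte
      // tail paths, with count still biased by 8 << shift
    }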


7222 
7223   // Setup pointers
7224   lea(src, Address(src, len, Address::times_2)); // char[]
7225   lea(dst, Address(dst, len, Address::times_1)); // byte[]
7226   negptr(len);
7227 
7228   if (UseSSE42Intrinsics || UseAVX >= 2) {
7229     Label L_chars_8_check, L_copy_8_chars, L_copy_8_chars_exit;
7230     Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
7231 
7232     if (UseAVX >= 2) {
7233       Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
7234       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
7235       movdl(tmp1Reg, tmp5);
7236       vpbroadcastd(tmp1Reg, tmp1Reg);
7237       jmpb(L_chars_32_check);
7238 
7239       bind(L_copy_32_chars);
7240       vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
7241       vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
7242       vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
7243       vptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
7244       jccb(Assembler::notZero, L_copy_32_chars_exit);
7245       vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
7246       vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
7247       vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
7248 
7249       bind(L_chars_32_check);
7250       addptr(len, 32);
7251       jccb(Assembler::lessEqual, L_copy_32_chars);
7252 
7253       bind(L_copy_32_chars_exit);
7254       subptr(len, 16);
7255       jccb(Assembler::greater, L_copy_16_chars_exit);
7256 
7257     } else if (UseSSE42Intrinsics) {
7258       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
7259       movdl(tmp1Reg, tmp5);
7260       pshufd(tmp1Reg, tmp1Reg, 0);
7261       jmpb(L_chars_16_check);
7262     }
7263 
7264     bind(L_copy_16_chars);
7265     if (UseAVX >= 2) {
7266       vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
7267       vptest(tmp2Reg, tmp1Reg);
7268       jccb(Assembler::notZero, L_copy_16_chars_exit);
7269       vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
7270       vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
7271     } else {
7272       if (UseAVX > 0) {
7273         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7274         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7275         vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
7276       } else {
7277         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7278         por(tmp2Reg, tmp3Reg);
7279         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7280         por(tmp2Reg, tmp4Reg);
7281       }
7282       ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
7283       jccb(Assembler::notZero, L_copy_16_chars_exit);
7284       packuswb(tmp3Reg, tmp4Reg);
7285     }
7286     movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
7287 
7288     bind(L_chars_16_check);
7289     addptr(len, 16);
7290     jccb(Assembler::lessEqual, L_copy_16_chars);
7291 
7292     bind(L_copy_16_chars_exit);
7293     if (UseAVX >= 2) {
7294       // clean upper bits of YMM registers
7295       vpxor(tmp2Reg, tmp2Reg);
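
The 256-bit path above leans on two instruction properties the code does not spell out: vptest against the broadcast 0xff00ff00 mask bails out if any char has a non-zero high byte (i.e. falls outside Latin-1), and vpackuswb narrows 16-bit values to bytes within each 128-bit lane independently, which is why the vpermq with immediate 0xD8 is needed afterwards. The lane fix-up, spelled out for the 32-char case:

    // After vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg), where tmp3Reg held chars 0..15
    // and tmp4Reg chars 16..31, the packed bytes sit in this qword order:
    //   q0 = chars  0..7    (tmp3Reg, lane 0)
    //   q1 = chars 16..23   (tmp4Reg, lane 0)
    //   q2 = chars  8..15   (tmp3Reg, lane 1)
    //   q3 = chars 24..31   (tmp4Reg, lane 1)
    // vpermq imm 0xD8 = 0b11011000 selects source qwords 0,2,1,3, giving
    // chars 0..7, 8..15, 16..23, 24..31, i.e. sequential bytes ready for the
    // 32-byte store into dst.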


7798  *
7799  * uint32_t crc;
7800  * val = crc_table[(val ^ crc) & 0xFF];
7801  * crc = val ^ (crc >> 8);
7802  *
7803  */
7804 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
7805   xorl(val, crc);
7806   andl(val, 0xFF);
7807   shrl(crc, 8); // unsigned shift
7808   xorl(crc, Address(table, val, Address::times_4, 0));
7809 }
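
The four instructions map one-to-one onto the table-driven update quoted in the comment above; the same step in plain C++:

    // C++ equivalent of update_byte_crc32 (sketch); 'table' is the 256-entry
    // CRC-32 lookup table the register argument points at.
    static inline uint32_t update_byte_crc32_c(uint32_t crc, uint32_t val,
                                               const uint32_t* table) {
      return table[(val ^ crc) & 0xFF] ^ (crc >> 8);
    }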
7810 
7811 /**
7812  * Fold 128-bit data chunk
7813  */
7814 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
7815   if (UseAVX > 0) {
7816     vpclmulhdq(xtmp, xK, xcrc); // [123:64]
7817     vpclmulldq(xcrc, xK, xcrc); // [63:0]
7818     vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
7819     pxor(xcrc, xtmp);
7820   } else {
7821     movdqa(xtmp, xcrc);
7822     pclmulhdq(xtmp, xK);   // [123:64]
7823     pclmulldq(xcrc, xK);   // [63:0]
7824     pxor(xcrc, xtmp);
7825     movdqu(xtmp, Address(buf, offset));
7826     pxor(xcrc, xtmp);
7827   }
7828 }
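
vpclmulhdq and vpclmulldq are MacroAssembler shorthands which, judging by the [123:64] and [63:0] comments, wrap (v)pclmulqdq with immediates 0x11 and 0x00, i.e. carry-less multiplies of the high and the low 64-bit halves of xK and xcrc. One folding step therefore multiplies the current 128-bit CRC state by the two folding constants and XORs in the next 16 input bytes. A hedged intrinsics sketch of the same step, under that assumption and not the HotSpot code path itself:

    #include <emmintrin.h>   // _mm_xor_si128
    #include <wmmintrin.h>   // _mm_clmulepi64_si128 (PCLMULQDQ)

    // xcrc: current 128-bit CRC state, xK: folding constants, data: next 16 bytes.
    static inline __m128i fold_128bit_crc32_sketch(__m128i xcrc, __m128i xK, __m128i data) {
      __m128i hi = _mm_clmulepi64_si128(xK, xcrc, 0x11);  // high halves
      __m128i lo = _mm_clmulepi64_si128(xK, xcrc, 0x00);  // low halves
      return _mm_xor_si128(_mm_xor_si128(lo, hi), data);
    }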
7829 
7830 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
7831   if (UseAVX > 0) {
7832     vpclmulhdq(xtmp, xK, xcrc);
7833     vpclmulldq(xcrc, xK, xcrc);
7834     pxor(xcrc, xbuf);
7835     pxor(xcrc, xtmp);
7836   } else {
7837     movdqa(xtmp, xcrc);
7838     pclmulhdq(xtmp, xK);


7942   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
7943   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
7944 
7945   // Fold the rest of 128 bits data chunks
7946   BIND(L_fold_tail);
7947   addl(len, 3);
7948   jccb(Assembler::lessEqual, L_fold_128b);
7949   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
7950 
7951   BIND(L_fold_tail_loop);
7952   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
7953   addptr(buf, 16);
7954   decrementl(len);
7955   jccb(Assembler::greater, L_fold_tail_loop);
7956 
7957   // Fold 128 bits in xmm1 down into 32 bits in crc register.
7958   BIND(L_fold_128b);
7959   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
7960   if (UseAVX > 0) {
7961     vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
7962     vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
7963     vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
7964   } else {
7965     movdqa(xmm2, xmm0);
7966     pclmulqdq(xmm2, xmm1, 0x1);
7967     movdqa(xmm3, xmm0);
7968     pand(xmm3, xmm2);
7969     pclmulqdq(xmm0, xmm3, 0x1);
7970   }
7971   psrldq(xmm1, 8);
7972   psrldq(xmm2, 4);
7973   pxor(xmm0, xmm1);
7974   pxor(xmm0, xmm2);
7975 
7976   // 8 8-bit folds to compute 32-bit CRC.
7977   for (int j = 0; j < 4; j++) {
7978     fold_8bit_crc32(xmm0, table, xmm1, rax);
7979   }
7980   movdl(crc, xmm0); // mov 32 bits to general register
7981   for (int j = 0; j < 4; j++) {
7982     fold_8bit_crc32(crc, table, rax);
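
The reduction above first folds the 128 bits in xmm1 down through 64 bits with the crc_by128_masks constants and pclmulqdq, then finishes with eight 8-bit table folds: four on the value while it is still in xmm0 and four more after movdl moves it into the crc register. For the register form, fold_8bit_crc32 presumably performs the standard one-byte shift-out, i.e. update_byte_crc32 with a zero data byte; a sketch under that assumption:

    // Presumed effect of fold_8bit_crc32(crc, table, tmp); sketch only.
    static inline uint32_t fold_8bit_crc32_c(uint32_t crc, const uint32_t* table) {
      return table[crc & 0xFF] ^ (crc >> 8);
    }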

