5978 movl(Address(to, 4), value);
5979 addptr(to, 8);
5980 BIND(L_fill_8_bytes);
5981 subl(count, 1 << (shift + 1));
5982 jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
5983 // fall through to fill 4 bytes
5984 } else {
// Bulk-fill branch: emits code that stores the replicated fill pattern in
// 32-byte chunks through an XMM register, then in 8-byte chunks.
// Bookkeeping visible in this branch: count is reduced by (1 << shift) for
// every 4 bytes stored, so (8 << shift) corresponds to 32 bytes and
// (1 << (shift + 1)) to 8 bytes.
5985 Label L_fill_32_bytes;
5986 if (!UseUnalignedLoadStores) {
5987 // align to 8 bytes, we know we are 4 byte aligned to start
5988 testptr(to, 4); // is bit 2 of the destination address set?
5989 jccb(Assembler::zero, L_fill_32_bytes); // clear: no fix-up store needed
5990 movl(Address(to, 0), value); // one 4-byte store, then advance past it
5991 addptr(to, 4);
5992 subl(count, 1<<shift); // account for the 4 bytes just written
5993 }
5994 BIND(L_fill_32_bytes);
5995 {
5996 assert( UseSSE >= 2, "supported cpu only" );
5997 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
5998 // Fill 32-byte chunks: replicate the 32-bit pattern across all of xtmp first
5999 movdl(xtmp, value); // low dword of xtmp = value
6000 pshufd(xtmp, xtmp, 0); // shuffle selector 0: every dword lane = dword 0
6001
6002 subl(count, 8 << shift); // pre-decrement by one 32-byte chunk
6003 jcc(Assembler::less, L_check_fill_8_bytes); // negative: less than 32 bytes left, skip the loop
6004 align(16); // presumably pads so the loop head below is 16-byte aligned - confirm
6005
6006 BIND(L_fill_32_bytes_loop);
6007
6008 if (UseUnalignedLoadStores) {
6009 movdqu(Address(to, 0), xtmp); // two 16-byte stores
6010 movdqu(Address(to, 16), xtmp);
6011 } else {
6012 movq(Address(to, 0), xtmp); // four 8-byte stores of the low half of xtmp
6013 movq(Address(to, 8), xtmp);
6014 movq(Address(to, 16), xtmp);
6015 movq(Address(to, 24), xtmp);
6016 }
6017
6018 addptr(to, 32);
6019 subl(count, 8 << shift);
6020 jcc(Assembler::greaterEqual, L_fill_32_bytes_loop); // signed test: another full chunk remains
6021 BIND(L_check_fill_8_bytes);
6022 addl(count, 8 << shift); // undo the over-subtraction: count is the true remainder again
6023 jccb(Assembler::zero, L_exit); // nothing left; L_exit is bound outside this branch
6024 jmpb(L_fill_8_bytes); // enter the 8-byte loop at its test, not at its store
6025
6026 //
6027 // length is too short, just fill qwords
6028 //
6029 BIND(L_fill_8_bytes_loop);
6030 movq(Address(to, 0), xtmp);
6031 addptr(to, 8);
6032 BIND(L_fill_8_bytes);
6033 subl(count, 1 << (shift + 1)); // one 8-byte chunk; leaves the bits of count below (shift + 1) unchanged
6034 jcc(Assembler::greaterEqual, L_fill_8_bytes_loop); // falls through with count negative
6035 }
6036 }
6037 // fill trailing 4 bytes
6038 BIND(L_fill_4_bytes);
6039 testl(count, 1<<shift);
6040 jccb(Assembler::zero, L_fill_2_bytes);
|
5978 movl(Address(to, 4), value);
5979 addptr(to, 8);
5980 BIND(L_fill_8_bytes);
5981 subl(count, 1 << (shift + 1));
5982 jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
5983 // fall through to fill 4 bytes
5984 } else {
// Bulk-fill branch: emits code that stores the replicated fill pattern in
// 64-byte chunks (AVX2 path) or 32-byte chunks (SSE2 path), then in 8-byte
// chunks.
// Bookkeeping visible in this branch: count is reduced by (1 << shift) for
// every 4 bytes stored, so (16 << shift) corresponds to 64 bytes,
// (8 << shift) to 32 bytes and (1 << (shift + 1)) to 8 bytes.
5985 Label L_fill_32_bytes;
5986 if (!UseUnalignedLoadStores) {
5987 // align to 8 bytes, we know we are 4 byte aligned to start
5988 testptr(to, 4); // is bit 2 of the destination address set?
5989 jccb(Assembler::zero, L_fill_32_bytes); // clear: no fix-up store needed
5990 movl(Address(to, 0), value); // one 4-byte store, then advance past it
5991 addptr(to, 4);
5992 subl(count, 1<<shift); // account for the 4 bytes just written
5993 }
5994 BIND(L_fill_32_bytes);
5995 {
5996 assert( UseSSE >= 2, "supported cpu only" );
5997 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
5998 movdl(xtmp, value); // low dword of xtmp = value; shared by both paths below
5999 if (UseAVX >= 2 && UseUnalignedLoadStores) {
6000 // Fill 64-byte chunks
6001 Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
6002 vpbroadcastd(xtmp, xtmp); // replicate the low dword across the vector register
6003
6004 subl(count, 16 << shift); // pre-decrement by one 64-byte chunk
6005 jcc(Assembler::less, L_check_fill_32_bytes); // negative: less than 64 bytes left, skip the loop
6006 align(16); // presumably pads so the loop head below is 16-byte aligned - confirm
6007
6008 BIND(L_fill_64_bytes_loop);
6009 vmovdqu(Address(to, 0), xtmp); // two stores 32 bytes apart cover the 64-byte chunk
6010 vmovdqu(Address(to, 32), xtmp);
6011 addptr(to, 64);
6012 subl(count, 16 << shift);
6013 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop); // signed test: another full chunk remains
6014
6015 BIND(L_check_fill_32_bytes);
6016 addl(count, 8 << shift); // count = remainder minus one 32-byte chunk
6017 jccb(Assembler::less, L_check_fill_8_bytes); // negative: under 32 bytes left; the addl there restores the remainder
6018 vmovdqu(Address(to, 0), xtmp); // exactly one more 32-byte store
6019 addptr(to, 32);
6020 subl(count, 8 << shift); // re-bias so the addl at L_check_fill_8_bytes yields the true remainder
6021 } else {
6022 // Fill 32-byte chunks
6023 pshufd(xtmp, xtmp, 0); // shuffle selector 0: every dword lane = dword 0
6024
6025 subl(count, 8 << shift); // pre-decrement by one 32-byte chunk
6026 jcc(Assembler::less, L_check_fill_8_bytes); // negative: less than 32 bytes left, skip the loop
6027 align(16); // presumably pads so the loop head below is 16-byte aligned - confirm
6028
6029 BIND(L_fill_32_bytes_loop);
6030
6031 if (UseUnalignedLoadStores) {
6032 movdqu(Address(to, 0), xtmp); // two 16-byte stores
6033 movdqu(Address(to, 16), xtmp);
6034 } else {
6035 movq(Address(to, 0), xtmp); // four 8-byte stores of the low half of xtmp
6036 movq(Address(to, 8), xtmp);
6037 movq(Address(to, 16), xtmp);
6038 movq(Address(to, 24), xtmp);
6039 }
6040
6041 addptr(to, 32);
6042 subl(count, 8 << shift);
6043 jcc(Assembler::greaterEqual, L_fill_32_bytes_loop); // signed test: another full chunk remains
6044 }
6045 BIND(L_check_fill_8_bytes); // both paths arrive here with count biased by -(8 << shift)
6046 addl(count, 8 << shift); // undo the bias: count is the true remainder again
6047 jccb(Assembler::zero, L_exit); // nothing left; L_exit is bound outside this branch
6048 jmpb(L_fill_8_bytes); // enter the 8-byte loop at its test, not at its store
6049
6050 //
6051 // length is too short, just fill qwords
6052 //
6053 // NOTE(review): after the AVX2 path xtmp still holds the 256-bit broadcast;
6053 // only its low 8 bytes are stored here. Whether this movq can incur an
6053 // SSE/AVX transition cost depends on how the assembler encodes it - confirm.
6053 BIND(L_fill_8_bytes_loop);
6054 movq(Address(to, 0), xtmp);
6055 addptr(to, 8);
6056 BIND(L_fill_8_bytes);
6057 subl(count, 1 << (shift + 1)); // one 8-byte chunk; leaves the bits of count below (shift + 1) unchanged
6058 jcc(Assembler::greaterEqual, L_fill_8_bytes_loop); // falls through with count negative
6059 }
6060 }
6061 // fill trailing 4 bytes
6062 BIND(L_fill_4_bytes);
6063 testl(count, 1<<shift);
6064 jccb(Assembler::zero, L_fill_2_bytes);
|