diff -r 0edd74a48586 src/cpu/x86/vm/stubGenerator_x86_64.cpp
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Mon Mar 07 15:03:48 2016 -0800
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Tue Mar 08 22:32:15 2016 +0300
@@ -1329,7 +1329,6 @@
     }
   }
 
-
   // Copy big chunks forward
   //
   // Inputs:
@@ -1548,16 +1547,66 @@
     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                       // r9 and r10 may be used to save non-volatile registers
 
+    __ movptr(byte_count, count);
+    __ testptr(byte_count, byte_count);
+    __ jcc(Assembler::zero, L_exit);
+
+    Label L_copy;
+    Label L_preloop;
+
+    // Copy single bytes until the low five bits of 'to' are clear or the
+    // count runs out.  'from' and 'to' are advanced in place so that no index
+    // register is needed (r9 and r10 may be in use, see above).
+  __ BIND(L_preloop);
+    __ testptr(to, 31);
+    __ jcc(Assembler::zero, L_copy);
+
+    __ movb(rax, Address(from, 0));
+    __ movb(Address(to, 0), rax);
+
+    __ increment(from);
+    __ increment(to);
+    __ decrement(byte_count);
+    __ jcc(Assembler::notZero, L_preloop);
+    __ jmp(L_exit);
+
+  __ BIND(L_copy);
+
     // 'from', 'to' and 'count' are now valid
-    __ movptr(byte_count, count);
-    __ shrptr(count, 3); // count => qword_count
+    __ movptr(qword_count, byte_count);
+    __ shrptr(qword_count, 3);
 
     // Copy from low to high addresses.  Use 'to' as scratch.
     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
     __ negptr(qword_count); // make the count negative
+
+    // Zero check
+    __ testptr(qword_count, qword_count);
+    __ jcc(Assembler::zero, L_copy_4_bytes);
+
+    // Enter the loop
     __ jmp(L_copy_bytes);
-
+
+    Label L_bulk_loop;
+    __ align(OptoLoopAlignment);
+
+    // Copy 32 bytes per iteration.  qword_count has already been advanced by
+    // 4 when the body runs, so offset -24 is the first qword of the chunk.
+    // NOTE(review): no CPU-feature guard for vmovdqu is visible here - confirm.
+  __ BIND(L_bulk_loop);
+    __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
+    __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
+  __ BIND(L_copy_bytes);
+    __ addptr(qword_count, 4);
+    __ jcc(Assembler::lessEqual, L_bulk_loop);
+    // xmm0 is the only vector register the loop uses
+    __ vpxor(xmm0, xmm0);
+
+    // Undo the last addptr(4); zero means no whole qwords are left
+    __ subptr(qword_count, 4);
+    __ jcc(Assembler::zero, L_copy_4_bytes);
+
     // Copy trailing qwords
   __ BIND(L_copy_8_bytes);
     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
@@ -1599,10 +1648,6 @@
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
-    // Copy in multi-bytes chunks
-    copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
-    __ jmp(L_copy_4_bytes);
-
     return start;
   }
 