src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
rev 48409 : imported patch arraycopy-base
*** 1485,1496 ****
address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
! Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
! Label L_copy_byte, L_exit;
const Register from = rdi; // source array address
const Register to = rsi; // destination array address
const Register count = rdx; // elements count
const Register byte_count = rcx;
const Register qword_count = count;
--- 1485,1495 ----
address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
! Label L_copy_qwords, L_copy_7bytes_or_less, L_copy_3bytes_or_less, L_copy_1byte_or_less, L_exit;
const Register from = rdi; // source array address
const Register to = rsi; // destination array address
const Register count = rdx; // elements count
const Register byte_count = rcx;
const Register qword_count = count;
*** 1510,1574 ****
setup_arg_regs(); // from => rdi, to => rsi, count => rdx
// r9 and r10 may be used to save non-volatile registers
// 'from', 'to' and 'count' are now valid
__ movptr(byte_count, count);
- __ shrptr(count, 3); // count => qword_count
! // Copy from low to high addresses. Use 'to' as scratch.
! __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
! __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
! __ negptr(qword_count); // make the count negative
! __ jmp(L_copy_bytes);
// Copy trailing qwords
! __ BIND(L_copy_8_bytes);
! __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
! __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
__ increment(qword_count);
! __ jcc(Assembler::notZero, L_copy_8_bytes);
// Check for and copy trailing dword
! __ BIND(L_copy_4_bytes);
! __ testl(byte_count, 4);
! __ jccb(Assembler::zero, L_copy_2_bytes);
! __ movl(rax, Address(end_from, 8));
! __ movl(Address(end_to, 8), rax);
!
__ addptr(end_from, 4);
__ addptr(end_to, 4);
// Check for and copy trailing word
! __ BIND(L_copy_2_bytes);
! __ testl(byte_count, 2);
! __ jccb(Assembler::zero, L_copy_byte);
! __ movw(rax, Address(end_from, 8));
! __ movw(Address(end_to, 8), rax);
!
__ addptr(end_from, 2);
__ addptr(end_to, 2);
// Check for and copy trailing byte
! __ BIND(L_copy_byte);
! __ testl(byte_count, 1);
__ jccb(Assembler::zero, L_exit);
! __ movb(rax, Address(end_from, 8));
! __ movb(Address(end_to, 8), rax);
__ BIND(L_exit);
restore_arg_regs();
inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
__ xorptr(rax, rax); // return 0
__ vzeroupper();
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
- // Copy in multi-bytes chunks
- copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
- __ jmp(L_copy_4_bytes);
-
return start;
}
// Arguments:
// aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
--- 1509,1782 ----
setup_arg_regs(); // from => rdi, to => rsi, count => rdx
// r9 and r10 may be used to save non-volatile registers
// 'from', 'to' and 'count' are now valid
+
+ guarantee(UseAVX >= 2, "Experimental code");
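+ // guarantee() stops the VM even in product builds if the condition fails;
+ // the 32-byte vector moves and vinserti128 below require AVX2.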
+
+ Label L_prepare_bulk_align;
+ Label L_tail_32, L_tail_16, L_tail_8, L_tail_4, L_tail_end;
+ Label L_tail_nozero_32, L_tail_nozero_16, L_tail_nozero_8, L_tail_nozero_4, L_tail_nozero_end;
+
__ movptr(byte_count, count);
! // If there is less than a qword to copy, don't bother with any magic;
! // go straight to the byte tail copy.
! __ cmpptr(byte_count, 8);
! __ jcc(Assembler::less, L_copy_7bytes_or_less);
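! // (end_from/end_to alias from/to in this stub, so the byte tails below
! // read and write at the original, unadvanced addresses in this case.)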
!
! // If greater than 64 bytes, it pays off to set up and run the
! // aligned bulk copy.
! __ cmpptr(byte_count, 64);
! __ jccb(Assembler::greater, L_prepare_bulk_align);
!
! // Between 8 and 64 bytes (at most 8 qwords) => jump to the qword copy
! // tail. This requires preparing qword_count and the src/dst addresses:
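! // For example, byte_count == 23 gives qword_count == 2: the qword loop
! // copies bytes 0..15, and the dword/word/byte tails pick up the last 7.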
! __ movptr(qword_count, byte_count);
! __ shrptr(qword_count, 3);
! __ lea(end_from, Address(from, qword_count, Address::times_8));
! __ lea(end_to, Address(to, qword_count, Address::times_8));
! __ negptr(qword_count);
! __ jmp(L_copy_qwords);
!
! // Pre-align slide: do enough individual copies to align the destination
! // to 32 bytes. At this point we know there are enough elements to reach
! // the required alignment, so byte_count needs no further checks.
! __ BIND(L_prepare_bulk_align);
!
! Label L_adjust_2byte, L_adjust_4byte, L_adjust_8byte, L_adjust_16byte, L_adjust_done;
!
! __ lea(rscratch2, Address(to, 0));
! __ andptr(rscratch2, 31);
! __ subptr(rscratch2, 32);
! __ negptr(rscratch2);
! __ andptr(rscratch2, 31);
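! // For example, (to & 31) == 5 yields rscratch2 = (32 - 5) & 31 == 27
! // pre-slide bytes; an already 32-byte-aligned 'to' yields 0.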
!
! // rscratch2 now holds the number of excess bytes; the pre-slide will
! // consume them. Adjust byte_count here; from/to are adjusted during the
! // pre-slide itself.
! __ subptr(byte_count, rscratch2);
!
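! // Each set bit of rscratch2 enables one of the 1/2/4/8/16-byte steps
! // below; together they advance from/to by exactly rscratch2 bytes.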
! __ testptr(rscratch2, 1);
! __ jccb(Assembler::zero, L_adjust_2byte);
! __ movb(rax, Address(from, 0));
! __ movb(Address(to, 0), rax);
! __ addptr(from, 1);
! __ addptr(to, 1);
!
! __ BIND(L_adjust_2byte);
! __ testptr(rscratch2, 2);
! __ jccb(Assembler::zero, L_adjust_4byte);
! __ movw(rax, Address(from, 0));
! __ movw(Address(to, 0), rax);
! __ addptr(from, 2);
! __ addptr(to, 2);
!
! __ BIND(L_adjust_4byte);
! __ testptr(rscratch2, 4);
! __ jccb(Assembler::zero, L_adjust_8byte);
! __ movl(rax, Address(from, 0));
! __ movl(Address(to, 0), rax);
! __ addptr(from, 4);
! __ addptr(to, 4);
!
! __ BIND(L_adjust_8byte);
! __ testptr(rscratch2, 8);
! __ jccb(Assembler::zero, L_adjust_16byte);
! __ movq(rax, Address(from, 0));
! __ movq(Address(to, 0), rax);
! __ addptr(from, 8);
! __ addptr(to, 8);
!
! __ BIND(L_adjust_16byte);
! __ testptr(rscratch2, 16);
! __ jccb(Assembler::zero, L_adjust_done);
! __ movq(rax, Address(from, 0));
! __ movq(Address(to, 0), rax);
! __ movq(rax, Address(from, 8));
! __ movq(Address(to, 8), rax);
! __ addptr(from, 16);
! __ addptr(to, 16);
!
! __ BIND(L_adjust_done);
!
! // Pre-slide done! At this point, destination is guaranteed to be aligned
! // to 32. This allows us to do the bulk copies with aligned stores.
!
! // Prepare qword count and src/dst addresses
! __ movptr(qword_count, byte_count);
! __ shrptr(qword_count, 3);
! __ lea(end_from, Address(from, qword_count, Address::times_8));
! __ lea(end_to, Address(to, qword_count, Address::times_8));
! __ negptr(qword_count);
!
! // Medium-sized arrays benefit from skipping the larger bulk stores.
! // Try to enter at the appropriate bulk tail: this avoids rushing
! // through a size-checking maze, and skips unnecessary zeroing of the
! // xmm/ymm registers.
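! // qword_count is negative here. Each add below biases it by the next
! // chunk size, so "greater" means fewer than that many qwords remain;
! // the sub(N), add(M) comments track the cumulative bias.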
! __ addptr(qword_count, 4);
! __ jcc(Assembler::greater, L_tail_nozero_end);
!
! __ addptr(qword_count, 4); // sub(4), add(8)
! __ jcc(Assembler::greater, L_tail_nozero_4);
!
! __ addptr(qword_count, 8); // sub(8), add(16)
! __ jcc(Assembler::greater, L_tail_nozero_8);
!
! __ addptr(qword_count, 16); // sub(16), add(32)
! __ jcc(Assembler::greater, L_tail_nozero_16);
!
! __ addptr(qword_count, 32); // sub(32), add(64)
! __ jcc(Assembler::greater, L_tail_nozero_32);
!
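! // Falling through here, qword_count carries a +64 bias: rscratch1 and
! // rscratch2 point 512 bytes past the copy cursor, and the loads/stores
! // below use offsets -512..-32.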
! // Massively parallel copy: moves 512 bytes (64 qwords) per iteration (default)
! Label L_bulk_loop_default;
! __ align(OptoLoopAlignment);
! __ BIND(L_bulk_loop_default);
! __ lea(rscratch1, Address(end_from, qword_count, Address::times_8));
!
! // Remarkably, doing just this first load as a pair of 16-byte accesses
! // helps performance: RESOURCE_STALLS falls abruptly. Extending this trick
! // to all the other loads degrades performance. :/
! __ movdqu(xmm15, Address(rscratch1, -512));
! __ vinserti128(xmm15, xmm15, Address(rscratch1, -496), 1);
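! // (movdqu + vinserti128 assembles the same 32 bytes as a single vmovdqu
! // ymm load, just split into two 16-byte accesses.)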
!
! __ vmovdqu(xmm14, Address(rscratch1, -480));
! __ vmovdqu(xmm13, Address(rscratch1, -448)); __ vmovdqu(xmm12, Address(rscratch1, -416));
! __ vmovdqu(xmm11, Address(rscratch1, -384)); __ vmovdqu(xmm10, Address(rscratch1, -352));
! __ vmovdqu(xmm9, Address(rscratch1, -320)); __ vmovdqu(xmm8, Address(rscratch1, -288));
! __ vmovdqu(xmm7, Address(rscratch1, -256)); __ vmovdqu(xmm6, Address(rscratch1, -224));
! __ vmovdqu(xmm5, Address(rscratch1, -192)); __ vmovdqu(xmm4, Address(rscratch1, -160));
! __ vmovdqu(xmm3, Address(rscratch1, -128)); __ vmovdqu(xmm2, Address(rscratch1, -96));
! __ vmovdqu(xmm1, Address(rscratch1, -64)); __ vmovdqu(xmm0, Address(rscratch1, -32));
!
! __ lea(rscratch2, Address(end_to, qword_count, Address::times_8));
! __ vmovdqa(Address(rscratch2, -512), xmm15); __ vmovdqa(Address(rscratch2, -480), xmm14);
! __ vmovdqa(Address(rscratch2, -448), xmm13); __ vmovdqa(Address(rscratch2, -416), xmm12);
! __ vmovdqa(Address(rscratch2, -384), xmm11); __ vmovdqa(Address(rscratch2, -352), xmm10);
! __ vmovdqa(Address(rscratch2, -320), xmm9); __ vmovdqa(Address(rscratch2, -288), xmm8);
! __ vmovdqa(Address(rscratch2, -256), xmm7); __ vmovdqa(Address(rscratch2, -224), xmm6);
! __ vmovdqa(Address(rscratch2, -192), xmm5); __ vmovdqa(Address(rscratch2, -160), xmm4);
! __ vmovdqa(Address(rscratch2, -128), xmm3); __ vmovdqa(Address(rscratch2, -96), xmm2);
! __ vmovdqa(Address(rscratch2, -64), xmm1); __ vmovdqa(Address(rscratch2, -32), xmm0);
!
! __ addptr(qword_count, 64);
! __ jcc(Assembler::lessEqual, L_bulk_loop_default);
!
! __ BIND(L_tail_32);
! __ vpxor(xmm15, xmm15);
! __ vpxor(xmm14, xmm14);
! __ vpxor(xmm13, xmm13);
! __ vpxor(xmm12, xmm12);
! __ vpxor(xmm11, xmm11);
! __ vpxor(xmm10, xmm10);
! __ vpxor(xmm9, xmm9);
! __ vpxor(xmm8, xmm8);
! __ BIND(L_tail_nozero_32);
!
! // Copy trailing bulk chunks, stepping down in size until only single
! // qwords remain:
! __ subptr(qword_count, 32); // sub(64), add(32)
! __ jcc(Assembler::greater, L_tail_16);
! __ lea(rscratch1, Address(end_from, qword_count, Address::times_8));
! __ lea(rscratch2, Address(end_to, qword_count, Address::times_8));
! __ vmovdqu(xmm7, Address(rscratch1, -256)); __ vmovdqu(xmm6, Address(rscratch1, -224));
! __ vmovdqu(xmm5, Address(rscratch1, -192)); __ vmovdqu(xmm4, Address(rscratch1, -160));
! __ vmovdqu(xmm3, Address(rscratch1, -128)); __ vmovdqu(xmm2, Address(rscratch1, -96));
! __ vmovdqu(xmm1, Address(rscratch1, -64)); __ vmovdqu(xmm0, Address(rscratch1, -32));
! __ vmovdqa(Address(rscratch2, -256), xmm7); __ vmovdqa(Address(rscratch2, -224), xmm6);
! __ vmovdqa(Address(rscratch2, -192), xmm5); __ vmovdqa(Address(rscratch2, -160), xmm4);
! __ vmovdqa(Address(rscratch2, -128), xmm3); __ vmovdqa(Address(rscratch2, -96), xmm2);
! __ vmovdqa(Address(rscratch2, -64), xmm1); __ vmovdqa(Address(rscratch2, -32), xmm0);
! __ addptr(qword_count, 32);
!
! __ BIND(L_tail_16);
! __ vpxor(xmm7, xmm7);
! __ vpxor(xmm6, xmm6);
! __ vpxor(xmm5, xmm5);
! __ vpxor(xmm4, xmm4);
! __ BIND(L_tail_nozero_16);
!
! __ subptr(qword_count, 16); // sub(32), add(16)
! __ jcc(Assembler::greater, L_tail_8);
! __ lea(rscratch1, Address(end_from, qword_count, Address::times_8));
! __ lea(rscratch2, Address(end_to, qword_count, Address::times_8));
! __ vmovdqu(xmm3, Address(rscratch1, -128)); __ vmovdqu(xmm2, Address(rscratch1, -96));
! __ vmovdqu(xmm1, Address(rscratch1, -64)); __ vmovdqu(xmm0, Address(rscratch1, -32));
! __ vmovdqa(Address(rscratch2, -128), xmm3); __ vmovdqa(Address(rscratch2, -96), xmm2);
! __ vmovdqa(Address(rscratch2, -64), xmm1); __ vmovdqa(Address(rscratch2, -32), xmm0);
! __ addptr(qword_count, 16);
!
! __ BIND(L_tail_8);
! __ vpxor(xmm3, xmm3);
! __ vpxor(xmm2, xmm2);
! __ BIND(L_tail_nozero_8);
!
! __ subptr(qword_count, 8); // sub(16), add(8)
! __ jcc(Assembler::greater, L_tail_4);
! __ lea(rscratch1, Address(end_from, qword_count, Address::times_8));
! __ lea(rscratch2, Address(end_to, qword_count, Address::times_8));
! __ vmovdqu(xmm1, Address(rscratch1, -64)); __ vmovdqu(xmm0, Address(rscratch1, -32));
! __ vmovdqa(Address(rscratch2, -64), xmm1); __ vmovdqa(Address(rscratch2, -32), xmm0);
! __ addptr(qword_count, 8);
!
! __ BIND(L_tail_4);
! __ vpxor(xmm1, xmm1);
! __ BIND(L_tail_nozero_4);
!
! __ subptr(qword_count, 4); // sub(8), add(4)
! __ jcc(Assembler::greater, L_tail_end);
! __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -32));
! __ vmovdqa(Address(end_to, qword_count, Address::times_8, -32), xmm0);
! __ addptr(qword_count, 4);
!
! __ BIND(L_tail_end);
! __ vpxor(xmm0, xmm0);
! __ BIND(L_tail_nozero_end);
!
! __ subptr(qword_count, 4); // drop the remaining bias; qword_count is exact again
! __ jcc(Assembler::zero, L_copy_7bytes_or_less);
// Copy trailing qwords
! __ BIND(L_copy_qwords);
! __ movq(rax, Address(end_from, qword_count, Address::times_8));
! __ movq(Address(end_to, qword_count, Address::times_8), rax);
__ increment(qword_count);
! __ jccb(Assembler::notZero, L_copy_qwords);
// Check for and copy trailing dword
! __ BIND(L_copy_7bytes_or_less);
! __ testptr(byte_count, 4);
! __ jccb(Assembler::zero, L_copy_3bytes_or_less);
! __ movl(rax, Address(end_from, 0));
! __ movl(Address(end_to, 0), rax);
__ addptr(end_from, 4);
__ addptr(end_to, 4);
// Check for and copy trailing word
! __ BIND(L_copy_3bytes_or_less);
! __ testptr(byte_count, 2);
! __ jccb(Assembler::zero, L_copy_1byte_or_less);
! __ movw(rax, Address(end_from, 0));
! __ movw(Address(end_to, 0), rax);
__ addptr(end_from, 2);
__ addptr(end_to, 2);
// Check for and copy trailing byte
! __ BIND(L_copy_1byte_or_less);
! __ testptr(byte_count, 1);
__ jccb(Assembler::zero, L_exit);
! __ movb(rax, Address(end_from, 0));
! __ movb(Address(end_to, 0), rax);
__ BIND(L_exit);
restore_arg_regs();
inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
__ xorptr(rax, rax); // return 0
__ vzeroupper();
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
// Arguments:
// aligned - true => Input and output aligned on a HeapWord == 8-byte boundary