
src/hotspot/cpu/x86/stubGenerator_x86_64.cpp

rev 48409 : imported patch arraycopy-base

*** 1485,1496 ****
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
  
!   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
!   Label L_copy_byte, L_exit;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register count       = rdx;  // elements count
    const Register byte_count  = rcx;
    const Register qword_count = count;
--- 1485,1495 ----
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
  
!   Label L_copy_qwords, L_copy_7bytes_or_less, L_copy_3bytes_or_less, L_copy_1byte_or_less, L_exit;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register count       = rdx;  // elements count
    const Register byte_count  = rcx;
    const Register qword_count = count;
*** 1510,1574 ****
    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                      // r9 and r10 may be used to save non-volatile registers
  
    // 'from', 'to' and 'count' are now valid
    __ movptr(byte_count, count);
-   __ shrptr(count, 3); // count => qword_count
  
!   // Copy from low to high addresses.  Use 'to' as scratch.
!   __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
!   __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
!   __ negptr(qword_count); // make the count negative
!   __ jmp(L_copy_bytes);
  
    // Copy trailing qwords
!   __ BIND(L_copy_8_bytes);
!   __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
!   __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
    __ increment(qword_count);
!   __ jcc(Assembler::notZero, L_copy_8_bytes);
  
    // Check for and copy trailing dword
!   __ BIND(L_copy_4_bytes);
!   __ testl(byte_count, 4);
!   __ jccb(Assembler::zero, L_copy_2_bytes);
!   __ movl(rax, Address(end_from, 8));
!   __ movl(Address(end_to, 8), rax);
!   __ addptr(end_from, 4);
    __ addptr(end_to, 4);
  
    // Check for and copy trailing word
!   __ BIND(L_copy_2_bytes);
!   __ testl(byte_count, 2);
!   __ jccb(Assembler::zero, L_copy_byte);
!   __ movw(rax, Address(end_from, 8));
!   __ movw(Address(end_to, 8), rax);
!   __ addptr(end_from, 2);
    __ addptr(end_to, 2);
  
    // Check for and copy trailing byte
!   __ BIND(L_copy_byte);
!   __ testl(byte_count, 1);
    __ jccb(Assembler::zero, L_exit);
!   __ movb(rax, Address(end_from, 8));
!   __ movb(Address(end_to, 8), rax);
  
    __ BIND(L_exit);
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
  
-   // Copy in multi-bytes chunks
-   copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
-   __ jmp(L_copy_4_bytes);
- 
    return start;
  }
  
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
--- 1509,1782 ----
    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                      // r9 and r10 may be used to save non-volatile registers
  
    // 'from', 'to' and 'count' are now valid
+ 
+   guarantee(UseAVX >= 2, "Experimental code");
+ 
+   Label L_prepare_bulk_align;
+   Label L_tail_32, L_tail_16, L_tail_8, L_tail_4, L_tail_end;
+   Label L_tail_nozero_32, L_tail_nozero_16, L_tail_nozero_8, L_tail_nozero_4, L_tail_nozero_end;
+ 
    __ movptr(byte_count, count);
  
!   // If less than a qword, then don't bother doing any magic, follow to
!   // byte tail copy.
!   __ cmpptr(byte_count, 8);
!   __ jcc(Assembler::less, L_copy_7bytes_or_less);
! 
!   // If greater than 64 bytes, then it makes sense to prepare and go to
!   // the aligned copy.
!   __ cmpptr(byte_count, 64);
!   __ jccb(Assembler::greater, L_prepare_bulk_align);
! 
!   // Less than 64 bytes (8 qwords) => jump to qword copy tail.
!   // This requires preparing the qword_count and src/dst addresses:
!   __ movptr(qword_count, byte_count);
!   __ shrptr(qword_count, 3);
!   __ lea(end_from, Address(from, qword_count, Address::times_8));
!   __ lea(end_to,   Address(to,   qword_count, Address::times_8));
!   __ negptr(qword_count);
!   __ jmp(L_copy_qwords);
! 
!   // Pre-align slide: do enough individual copies to align the destination
!   // at 32 bytes. At this point we know there are enough elements to hit
!   // the proper alignment, so we don't need to check byte_count.
!   __ BIND(L_prepare_bulk_align);
! 
!   Label L_adjust_2byte, L_adjust_4byte, L_adjust_8byte, L_adjust_16byte, L_adjust_done;
! 
!   __ lea(rscratch2, Address(to, 0));
!   __ andptr(rscratch2, 31);
!   __ subptr(rscratch2, 32);
!   __ negptr(rscratch2);
!   __ andptr(rscratch2, 31);
! 
!   // rscratch2 holds the number of excess bytes found; the pre-slide will
!   // consume them. Adjust the byte count here; from/to get adjusted during
!   // the pre-slide.
!   __ subptr(byte_count, rscratch2);
! 
!   __ testptr(rscratch2, 1);
!   __ jccb(Assembler::zero, L_adjust_2byte);
!   __ movb(rax, Address(from, 0));
!   __ movb(Address(to, 0), rax);
!   __ addptr(from, 1);
!   __ addptr(to, 1);
! 
!   __ BIND(L_adjust_2byte);
!   __ testptr(rscratch2, 2);
!   __ jccb(Assembler::zero, L_adjust_4byte);
!   __ movw(rax, Address(from, 0));
!   __ movw(Address(to, 0), rax);
!   __ addptr(from, 2);
!   __ addptr(to, 2);
! 
!   __ BIND(L_adjust_4byte);
!   __ testptr(rscratch2, 4);
!   __ jccb(Assembler::zero, L_adjust_8byte);
!   __ movl(rax, Address(from, 0));
!   __ movl(Address(to, 0), rax);
!   __ addptr(from, 4);
!   __ addptr(to, 4);
! 
!   __ BIND(L_adjust_8byte);
!   __ testptr(rscratch2, 8);
!   __ jccb(Assembler::zero, L_adjust_16byte);
!   __ movq(rax, Address(from, 0));
!   __ movq(Address(to, 0), rax);
!   __ addptr(from, 8);
!   __ addptr(to, 8);
! 
!   __ BIND(L_adjust_16byte);
!   __ testptr(rscratch2, 16);
!   __ jccb(Assembler::zero, L_adjust_done);
!   __ movq(rax, Address(from, 0));
!   __ movq(Address(to, 0), rax);
!   __ movq(rax, Address(from, 8));
!   __ movq(Address(to, 8), rax);
!   __ addptr(from, 16);
!   __ addptr(to, 16);
! 
!   __ BIND(L_adjust_done);
! 
!   // Pre-slide done! At this point, the destination is guaranteed to be
!   // aligned to 32 bytes. This allows us to do the bulk copies with aligned
!   // stores.
! 
!   // Prepare qword count and src/dst addresses
!   __ movptr(qword_count, byte_count);
!   __ shrptr(qword_count, 3);
!   __ lea(end_from, Address(from, qword_count, Address::times_8));
!   __ lea(end_to,   Address(to,   qword_count, Address::times_8));
!   __ negptr(qword_count);
! 
!   // Medium-sized arrays benefit from skipping the larger bulk stores.
!   // Try to enter at the appropriate bulk tail; this avoids rushing
!   // through a size-checking maze, and avoids unnecessary zeroing of the
!   // xmm/ymm registers.
!   __ addptr(qword_count, 4);
!   __ jcc(Assembler::greater, L_tail_nozero_end);
! 
!   __ addptr(qword_count, 4);  // sub(4), add(8)
!   __ jcc(Assembler::greater, L_tail_nozero_4);
! 
!   __ addptr(qword_count, 8);  // sub(8), add(16)
!   __ jcc(Assembler::greater, L_tail_nozero_8);
! 
!   __ addptr(qword_count, 16); // sub(16), add(32)
!   __ jcc(Assembler::greater, L_tail_nozero_16);
! 
!   __ addptr(qword_count, 32); // sub(32), add(64)
!   __ jcc(Assembler::greater, L_tail_nozero_32);
! 
!   // Massively parallel copy: moves lots of data on each iteration (default)
!   Label L_bulk_loop_default;
!   __ align(OptoLoopAlignment);
!   __ BIND(L_bulk_loop_default);
!   __ lea(rscratch1, Address(end_from, qword_count, Address::times_8));
! 
!   // Remarkably, doing a single pair of 16-byte accesses helps performance:
!   // RESOURCE_STALLS falls abruptly. Extending this trick to all other loads
!   // degrades performance. :/
!   __ movdqu(xmm15, Address(rscratch1, -512));
!   __ vinserti128(xmm15, xmm15, Address(rscratch1, -496), 1);
! 
!   __ vmovdqu(xmm14, Address(rscratch1, -480));
!   __ vmovdqu(xmm13, Address(rscratch1, -448)); __ vmovdqu(xmm12, Address(rscratch1, -416));
!   __ vmovdqu(xmm11, Address(rscratch1, -384)); __ vmovdqu(xmm10, Address(rscratch1, -352));
!   __ vmovdqu(xmm9,  Address(rscratch1, -320)); __ vmovdqu(xmm8,  Address(rscratch1, -288));
!   __ vmovdqu(xmm7,  Address(rscratch1, -256)); __ vmovdqu(xmm6,  Address(rscratch1, -224));
!   __ vmovdqu(xmm5,  Address(rscratch1, -192)); __ vmovdqu(xmm4,  Address(rscratch1, -160));
!   __ vmovdqu(xmm3,  Address(rscratch1, -128)); __ vmovdqu(xmm2,  Address(rscratch1, -96));
!   __ vmovdqu(xmm1,  Address(rscratch1, -64));  __ vmovdqu(xmm0,  Address(rscratch1, -32));
! 
!   __ lea(rscratch2, Address(end_to, qword_count, Address::times_8));
!   __ vmovdqa(Address(rscratch2, -512), xmm15); __ vmovdqa(Address(rscratch2, -480), xmm14);
!   __ vmovdqa(Address(rscratch2, -448), xmm13); __ vmovdqa(Address(rscratch2, -416), xmm12);
!   __ vmovdqa(Address(rscratch2, -384), xmm11); __ vmovdqa(Address(rscratch2, -352), xmm10);
!   __ vmovdqa(Address(rscratch2, -320), xmm9);  __ vmovdqa(Address(rscratch2, -288), xmm8);
!   __ vmovdqa(Address(rscratch2, -256), xmm7);  __ vmovdqa(Address(rscratch2, -224), xmm6);
!   __ vmovdqa(Address(rscratch2, -192), xmm5);  __ vmovdqa(Address(rscratch2, -160), xmm4);
!   __ vmovdqa(Address(rscratch2, -128), xmm3);  __ vmovdqa(Address(rscratch2, -96), xmm2);
!   __ vmovdqa(Address(rscratch2, -64), xmm1);   __ vmovdqa(Address(rscratch2, -32), xmm0);
! 
!   __ addptr(qword_count, 64);
!   __ jcc(Assembler::lessEqual, L_bulk_loop_default);
! 
!   __ BIND(L_tail_32);
!   __ vpxor(xmm15, xmm15);
!   __ vpxor(xmm14, xmm14);
!   __ vpxor(xmm13, xmm13);
!   __ vpxor(xmm12, xmm12);
!   __ vpxor(xmm11, xmm11);
!   __ vpxor(xmm10, xmm10);
!   __ vpxor(xmm9, xmm9);
!   __ vpxor(xmm8, xmm8);
!   __ BIND(L_tail_nozero_32);
! 
!   // Copy trailing bulk qwords in progressively smaller chunks:
!   __ subptr(qword_count, 32); // sub(64), add(32)
!   __ jcc(Assembler::greater, L_tail_16);
!   __ lea(rscratch1, Address(end_from, qword_count, Address::times_8));
!   __ lea(rscratch2, Address(end_to,   qword_count, Address::times_8));
!   __ vmovdqu(xmm7, Address(rscratch1, -256));  __ vmovdqu(xmm6, Address(rscratch1, -224));
!   __ vmovdqu(xmm5, Address(rscratch1, -192));  __ vmovdqu(xmm4, Address(rscratch1, -160));
!   __ vmovdqu(xmm3, Address(rscratch1, -128));  __ vmovdqu(xmm2, Address(rscratch1, -96));
!   __ vmovdqu(xmm1, Address(rscratch1, -64));   __ vmovdqu(xmm0, Address(rscratch1, -32));
!   __ vmovdqa(Address(rscratch2, -256), xmm7);  __ vmovdqa(Address(rscratch2, -224), xmm6);
!   __ vmovdqa(Address(rscratch2, -192), xmm5);  __ vmovdqa(Address(rscratch2, -160), xmm4);
!   __ vmovdqa(Address(rscratch2, -128), xmm3);  __ vmovdqa(Address(rscratch2, -96), xmm2);
!   __ vmovdqa(Address(rscratch2, -64), xmm1);   __ vmovdqa(Address(rscratch2, -32), xmm0);
!   __ addptr(qword_count, 32);
! 
!   __ BIND(L_tail_16);
!   __ vpxor(xmm7, xmm7);
!   __ vpxor(xmm6, xmm6);
!   __ vpxor(xmm5, xmm5);
!   __ vpxor(xmm4, xmm4);
!   __ BIND(L_tail_nozero_16);
! 
!   __ subptr(qword_count, 16); // sub(32), add(16)
!   __ jcc(Assembler::greater, L_tail_8);
!   __ lea(rscratch1, Address(end_from, qword_count, Address::times_8));
!   __ lea(rscratch2, Address(end_to,   qword_count, Address::times_8));
!   __ vmovdqu(xmm3, Address(rscratch1, -128));  __ vmovdqu(xmm2, Address(rscratch1, -96));
!   __ vmovdqu(xmm1, Address(rscratch1, -64));   __ vmovdqu(xmm0, Address(rscratch1, -32));
!   __ vmovdqa(Address(rscratch2, -128), xmm3);  __ vmovdqa(Address(rscratch2, -96), xmm2);
!   __ vmovdqa(Address(rscratch2, -64), xmm1);   __ vmovdqa(Address(rscratch2, -32), xmm0);
!   __ addptr(qword_count, 16);
! 
!   __ BIND(L_tail_8);
!   __ vpxor(xmm3, xmm3);
!   __ vpxor(xmm2, xmm2);
!   __ BIND(L_tail_nozero_8);
! 
!   __ subptr(qword_count, 8); // sub(16), add(8)
!   __ jcc(Assembler::greater, L_tail_4);
!   __ lea(rscratch1, Address(end_from, qword_count, Address::times_8));
!   __ lea(rscratch2, Address(end_to,   qword_count, Address::times_8));
!   __ vmovdqu(xmm1, Address(rscratch1, -64));   __ vmovdqu(xmm0, Address(rscratch1, -32));
!   __ vmovdqa(Address(rscratch2, -64), xmm1);   __ vmovdqa(Address(rscratch2, -32), xmm0);
!   __ addptr(qword_count, 8);
! 
!   __ BIND(L_tail_4);
!   __ vpxor(xmm1, xmm1);
!   __ BIND(L_tail_nozero_4);
! 
!   __ subptr(qword_count, 4); // sub(8), add(4)
!   __ jcc(Assembler::greater, L_tail_end);
!   __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -32));
!   __ vmovdqa(Address(end_to, qword_count, Address::times_8, -32), xmm0);
!   __ addptr(qword_count, 4);
! 
!   __ BIND(L_tail_end);
!   __ vpxor(xmm0, xmm0);
!   __ BIND(L_tail_nozero_end);
! 
!   __ subptr(qword_count, 4);
!   __ jcc(Assembler::zero, L_copy_7bytes_or_less);
  
    // Copy trailing qwords
!   __ BIND(L_copy_qwords);
!   __ movq(rax, Address(end_from, qword_count, Address::times_8));
!   __ movq(Address(end_to, qword_count, Address::times_8), rax);
    __ increment(qword_count);
!   __ jccb(Assembler::notZero, L_copy_qwords);
  
    // Check for and copy trailing dword
!   __ BIND(L_copy_7bytes_or_less);
!   __ testptr(byte_count, 4);
!   __ jccb(Assembler::zero, L_copy_3bytes_or_less);
!   __ movl(rax, Address(end_from, 0));
!   __ movl(Address(end_to, 0), rax);
    __ addptr(end_from, 4);
    __ addptr(end_to, 4);
  
    // Check for and copy trailing word
!   __ BIND(L_copy_3bytes_or_less);
!   __ testptr(byte_count, 2);
!   __ jccb(Assembler::zero, L_copy_1byte_or_less);
!   __ movw(rax, Address(end_from, 0));
!   __ movw(Address(end_to, 0), rax);
    __ addptr(end_from, 2);
    __ addptr(end_to, 2);
  
    // Check for and copy trailing byte
!   __ BIND(L_copy_1byte_or_less);
!   __ testptr(byte_count, 1);
    __ jccb(Assembler::zero, L_exit);
!   __ movb(rax, Address(end_from, 0));
!   __ movb(Address(end_to, 0), rax);
  
    __ BIND(L_exit);
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
  
    return start;
  }
  
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
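
Editor's note on the overall shape of the new stub: the generated code dispatches on size. Below is a plain-C++ sketch of that control flow, offered as a model only: every name is invented for illustration, the thresholds (8 and 64 bytes, 32-byte destination alignment) are taken from the patch, and the AVX2 bulk stages are elided, with the scalar qword loop standing in for them functionally.

#include <cstddef>
#include <cstdint>
#include <cstring>

namespace sketch {

// Mirrors the L_adjust_* pre-slide chain: copies n < 32 bytes in
// 1/2/4/8/16-byte pieces, advancing both pointers.
inline void copy_adjust(const uint8_t*& from, uint8_t*& to, std::size_t n) {
  if (n & 1)  { *to = *from;               from += 1;  to += 1;  }
  if (n & 2)  { std::memcpy(to, from, 2);  from += 2;  to += 2;  }
  if (n & 4)  { std::memcpy(to, from, 4);  from += 4;  to += 4;  }
  if (n & 8)  { std::memcpy(to, from, 8);  from += 8;  to += 8;  }
  if (n & 16) { std::memcpy(to, from, 16); from += 16; to += 16; }
}

void disjoint_byte_copy(const uint8_t* from, uint8_t* to, std::size_t byte_count) {
  if (byte_count >= 8) {
    if (byte_count > 64) {
      // Pre-align slide: make the destination 32-byte aligned so the bulk
      // loop can use aligned (vmovdqa) stores.
      std::size_t pre = (32 - (reinterpret_cast<uintptr_t>(to) & 31)) & 31;
      copy_adjust(from, to, pre);
      byte_count -= pre;
      // ... the 512/256/128/64/32-byte AVX2 stages would run here; the
      // qword loop below also covers them in this scalar model ...
    }
    // L_copy_qwords: whole 8-byte words.
    for (std::size_t q = byte_count >> 3; q != 0; q--) {
      std::memcpy(to, from, 8);
      from += 8; to += 8;
    }
  }
  // L_copy_7bytes_or_less .. L_copy_1byte_or_less: the 4/2/1-byte tail,
  // driven by the low bits of the original byte count.
  if (byte_count & 4) { std::memcpy(to, from, 4); from += 4; to += 4; }
  if (byte_count & 2) { std::memcpy(to, from, 2); from += 2; to += 2; }
  if (byte_count & 1) { *to = *from; }
}

} // namespace sketch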
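
The four-instruction sequence at the top of the pre-align slide (andptr, subptr, negptr, andptr on rscratch2) computes the number of bytes needed to reach the next 32-byte boundary, with the final mask turning 32 into 0 for an already-aligned destination. A small self-checking sketch of the arithmetic (function name hypothetical):

#include <cassert>
#include <cstdint>

// r = (32 - (to & 31)) & 31, spelled the way the stub emits it.
static inline uintptr_t pre_slide_bytes(uintptr_t to) {
  uintptr_t r = to & 31;             // andptr(rscratch2, 31)
  r = r - 32;                        // subptr(rscratch2, 32)
  r = (uintptr_t)(-(intptr_t)r);     // negptr(rscratch2)
  return r & 31;                     // andptr(rscratch2, 31): 32 -> 0
}

int main() {
  assert(pre_slide_bytes(0x1000) == 0);   // already 32-byte aligned
  assert(pre_slide_bytes(0x1007) == 25);  // 25 = 16 + 8 + 1 => three L_adjust_* pieces
  assert(pre_slide_bytes(0x101f) == 1);   // one byte short of the boundary
  return 0;
}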
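
The addptr/jcc ladder in front of the bulk loop is effectively a computed entry into the tail stages: qword_count is negative at that point, so adding back a power of two and testing the sign selects the largest stage that still fits, with no chain of explicit compares. A scalar model of the selection (hypothetical function; stage widths and label mapping read off the patch):

#include <cstddef>

// neg_qwords == -N on entry; returns the qword width of the stage the
// generated code would enter at.
static int bulk_entry_stage(std::ptrdiff_t neg_qwords) {
  std::ptrdiff_t q = neg_qwords;
  if ((q += 4)  > 0) return 0;   // N < 4:  qword tail only (L_tail_nozero_end)
  if ((q += 4)  > 0) return 4;   // N < 8:  32-byte stage   (L_tail_nozero_4)
  if ((q += 8)  > 0) return 8;   // N < 16: 64-byte stage   (L_tail_nozero_8)
  if ((q += 16) > 0) return 16;  // N < 32: 128-byte stage  (L_tail_nozero_16)
  if ((q += 32) > 0) return 32;  // N < 64: 256-byte stage  (L_tail_nozero_32)
  return 64;                     // N >= 64: 512-byte main loop
}

Entering mid-ladder at an L_tail_nozero_* label also skips the vpxor scrubbing of the wider registers, which only the fall-through path from a larger stage performs.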
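
Finally, the qword tail keeps the pre-existing negative-index idiom: both end addresses are formed with lea, the count is negated, and the loop counts up toward zero so that the increment itself produces the flags the branch consumes, saving a separate compare. A scalar equivalent, with hypothetical names:

#include <cstddef>
#include <cstdint>
#include <cstring>

static void copy_qwords(const uint8_t* from, uint8_t* to, std::size_t qword_count) {
  const uint8_t* end_from = from + qword_count * 8;        // lea(end_from, ...)
  uint8_t*       end_to   = to   + qword_count * 8;        // lea(end_to, ...)
  std::ptrdiff_t i = -(std::ptrdiff_t)qword_count;         // negptr(qword_count)
  while (i != 0) {                                         // L_copy_qwords
    std::memcpy(end_to + i * 8, end_from + i * 8, 8);      // movq load/store pair
    i++;                                                   // increment sets ZF for jccb
  }
}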