# HG changeset patch
# User shade
# Date 1515161098 -3600
#      Fri Jan 05 15:04:58 2018 +0100
# Node ID 76018e8c971e4f1b58bb0fa1eb9f5868af0ba2bb
# Parent  4d7a4fad8190670f836010a1a5d12772da3c365c
imported patch arraycopy-base

diff --git a/src/hotspot/cpu/x86/assembler_x86.cpp b/src/hotspot/cpu/x86/assembler_x86.cpp
--- a/src/hotspot/cpu/x86/assembler_x86.cpp
+++ b/src/hotspot/cpu/x86/assembler_x86.cpp
@@ -2156,6 +2156,14 @@
   emit_int8((unsigned char)0xF0);
 }
 
+// Emit sfence instruction
+void Assembler::sfence() {
+  NOT_LP64(assert(VM_Version::supports_sse2(), "unsupported");)
+  emit_int8(0x0F);
+  emit_int8((unsigned char)0xAE);
+  emit_int8((unsigned char)0xF8);
+}
+
 void Assembler::mov(Register dst, Register src) {
   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
 }
@@ -2507,6 +2515,30 @@
   emit_operand(src, dst);
 }
 
+void Assembler::vmovdqa(Address dst, XMMRegister src) {
+  assert(UseAVX > 0, "");
+  InstructionMark im(this);
+  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
+  // swap src<->dst for encoding
+  assert(src != xnoreg, "sanity");
+  vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x7F);
+  emit_operand(src, dst);
+}
+
+void Assembler::vmovntpd(Address dst, XMMRegister src) {
+  assert(UseAVX > 0, "");
+  InstructionMark im(this);
+  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
+  // swap src<->dst for encoding
+  assert(src != xnoreg, "sanity");
+  vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x2B);
+  emit_operand(src, dst);
+}
+
 // Move Unaligned EVEX enabled Vector (programmable : 8,16,32,64)
 void Assembler::evmovdqub(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
diff --git a/src/hotspot/cpu/x86/assembler_x86.hpp b/src/hotspot/cpu/x86/assembler_x86.hpp
--- a/src/hotspot/cpu/x86/assembler_x86.hpp
+++ b/src/hotspot/cpu/x86/assembler_x86.hpp
@@ -1343,6 +1343,7 @@
   }
 
   void mfence();
+  void sfence();
 
   // Moves
 
@@ -1402,6 +1403,9 @@
   void vmovdqu(XMMRegister dst, Address src);
   void vmovdqu(XMMRegister dst, XMMRegister src);
 
+  void vmovdqa(Address dst, XMMRegister src);
+  void vmovntpd(Address dst, XMMRegister src);
+
   // Move Unaligned 512bit Vector
   void evmovdqub(Address dst, XMMRegister src, int vector_len);
   void evmovdqub(XMMRegister dst, Address src, int vector_len);
diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
@@ -1487,8 +1487,7 @@
     StubCodeMark mark(this, "StubRoutines", name);
     address start = __ pc();
 
-    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
-    Label L_copy_byte, L_exit;
+    Label L_copy_qwords, L_copy_7bytes_or_less, L_copy_3bytes_or_less, L_copy_1byte_or_less, L_exit;
     const Register from        = rdi;  // source array address
     const Register to          = rsi;  // destination array address
     const Register count       = rdx;  // elements count
@@ -1512,48 +1511,261 @@
     // r9 and r10 may be used to save non-volatile registers
 
     // 'from', 'to' and 'count' are now valid
+
+    guarantee(UseAVX >= 2, "Experimental code");
+
+    Label L_prepare_bulk_align;
+    Label L_tail_32, L_tail_16, L_tail_8, L_tail_4, L_tail_end;
+    Label L_tail_nozero_32, L_tail_nozero_16, L_tail_nozero_8, L_tail_nozero_4, L_tail_nozero_end;
+
     __ movptr(byte_count, count);
-    __ shrptr(count, 3); // count => qword_count
-
-    // Copy from low to high addresses.  Use 'to' as scratch.
-    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
-    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
-    __ negptr(qword_count); // make the count negative
-    __ jmp(L_copy_bytes);
+
+    // If less than a qword, then don't bother doing any magic, fall through
+    // to the byte tail copy.
+    __ cmpptr(byte_count, 8);
+    __ jcc(Assembler::less, L_copy_7bytes_or_less);
+
+    // If greater than 64 bytes, then it makes sense to prepare and go to
+    // the aligned copy.
+    __ cmpptr(byte_count, 64);
+    __ jccb(Assembler::greater, L_prepare_bulk_align);
+
+    // Less than 64 bytes (8 qwords) => jump to the qword copy tail.
+    // This requires preparing the qword_count and src/dst addresses:
+    __ movptr(qword_count, byte_count);
+    __ shrptr(qword_count, 3);
+    __ lea(end_from, Address(from, qword_count, Address::times_8));
+    __ lea(end_to,   Address(to,   qword_count, Address::times_8));
+    __ negptr(qword_count);
+    __ jmp(L_copy_qwords);
+
+    // Pre-align slide: do enough individual copies to align the destination at 32 bytes.
+    // At this point we know there are enough elements to reach the proper alignment,
+    // so there is no need to check byte_count.
+  __ BIND(L_prepare_bulk_align);
+
+    Label L_adjust_2byte, L_adjust_4byte, L_adjust_8byte, L_adjust_16byte, L_adjust_done;
+
+    __ lea(rscratch2, Address(to, 0));
+    __ andptr(rscratch2, 31);
+    __ subptr(rscratch2, 32);
+    __ negptr(rscratch2);
+    __ andptr(rscratch2, 31);
+
+    // rscratch2 now holds the number of excess bytes; the pre-slide will consume
+    // them. Adjust the byte count here; from/to get adjusted during the pre-slide.
+    __ subptr(byte_count, rscratch2);
+
+    __ testptr(rscratch2, 1);
+    __ jccb(Assembler::zero, L_adjust_2byte);
+    __ movb(rax, Address(from, 0));
+    __ movb(Address(to, 0), rax);
+    __ addptr(from, 1);
+    __ addptr(to, 1);
+
+  __ BIND(L_adjust_2byte);
+    __ testptr(rscratch2, 2);
+    __ jccb(Assembler::zero, L_adjust_4byte);
+    __ movw(rax, Address(from, 0));
+    __ movw(Address(to, 0), rax);
+    __ addptr(from, 2);
+    __ addptr(to, 2);
+
+  __ BIND(L_adjust_4byte);
+    __ testptr(rscratch2, 4);
+    __ jccb(Assembler::zero, L_adjust_8byte);
+    __ movl(rax, Address(from, 0));
+    __ movl(Address(to, 0), rax);
+    __ addptr(from, 4);
+    __ addptr(to, 4);
+
+  __ BIND(L_adjust_8byte);
+    __ testptr(rscratch2, 8);
+    __ jccb(Assembler::zero, L_adjust_16byte);
+    __ movq(rax, Address(from, 0));
+    __ movq(Address(to, 0), rax);
+    __ addptr(from, 8);
+    __ addptr(to, 8);
+
+  __ BIND(L_adjust_16byte);
+    __ testptr(rscratch2, 16);
+    __ jccb(Assembler::zero, L_adjust_done);
+    __ movq(rax, Address(from, 0));
+    __ movq(Address(to, 0), rax);
+    __ movq(rax, Address(from, 8));
+    __ movq(Address(to, 8), rax);
+    __ addptr(from, 16);
+    __ addptr(to, 16);
+
+  __ BIND(L_adjust_done);
+
+    // Pre-slide done! At this point, the destination is guaranteed to be aligned
+    // to 32 bytes. This allows us to do the bulk copies with aligned stores.
+
+    // Prepare qword count and src/dst addresses
+    __ movptr(qword_count, byte_count);
+    __ shrptr(qword_count, 3);
+    __ lea(end_from, Address(from, qword_count, Address::times_8));
+    __ lea(end_to,   Address(to,   qword_count, Address::times_8));
+    __ negptr(qword_count);
+
+    // Medium-sized arrays benefit from skipping the larger bulk stores.
+    // Try to enter at the appropriate bulk tail; this avoids rushing
+    // through a size-checking maze, and avoids unnecessary zeroing of the
+    // xmm/ymm registers.
+    __ addptr(qword_count, 4);
+    __ jcc(Assembler::greater, L_tail_nozero_end);
+
+    __ addptr(qword_count, 4);   // sub(4), add(8)
+    __ jcc(Assembler::greater, L_tail_nozero_4);
+
+    __ addptr(qword_count, 8);   // sub(8), add(16)
+    __ jcc(Assembler::greater, L_tail_nozero_8);
+
+    __ addptr(qword_count, 16);  // sub(16), add(32)
+    __ jcc(Assembler::greater, L_tail_nozero_16);
+
+    __ addptr(qword_count, 32);  // sub(32), add(64)
+    __ jcc(Assembler::greater, L_tail_nozero_32);
+
+    // Massively parallel copy: moves lots of data on each iteration (default)
+    Label L_bulk_loop_default;
+    __ align(OptoLoopAlignment);
+  __ BIND(L_bulk_loop_default);
+    __ lea(rscratch1, Address(end_from, qword_count, Address::times_8));
+
+    // Remarkably, doing a single pair of 16-byte accesses helps performance:
+    // RESOURCE_STALLS falls abruptly. Extending this trick to all other loads
+    // degrades performance. :/
+    __ movdqu(xmm15, Address(rscratch1, -512));
+    __ vinserti128(xmm15, xmm15, Address(rscratch1, -496), 1);
+
+    __ vmovdqu(xmm14, Address(rscratch1, -480));
+    __ vmovdqu(xmm13, Address(rscratch1, -448)); __ vmovdqu(xmm12, Address(rscratch1, -416));
+    __ vmovdqu(xmm11, Address(rscratch1, -384)); __ vmovdqu(xmm10, Address(rscratch1, -352));
+    __ vmovdqu(xmm9,  Address(rscratch1, -320)); __ vmovdqu(xmm8,  Address(rscratch1, -288));
+    __ vmovdqu(xmm7,  Address(rscratch1, -256)); __ vmovdqu(xmm6,  Address(rscratch1, -224));
+    __ vmovdqu(xmm5,  Address(rscratch1, -192)); __ vmovdqu(xmm4,  Address(rscratch1, -160));
+    __ vmovdqu(xmm3,  Address(rscratch1, -128)); __ vmovdqu(xmm2,  Address(rscratch1, -96));
+    __ vmovdqu(xmm1,  Address(rscratch1, -64));  __ vmovdqu(xmm0,  Address(rscratch1, -32));
+
+    __ lea(rscratch2, Address(end_to, qword_count, Address::times_8));
+    __ vmovdqa(Address(rscratch2, -512), xmm15); __ vmovdqa(Address(rscratch2, -480), xmm14);
+    __ vmovdqa(Address(rscratch2, -448), xmm13); __ vmovdqa(Address(rscratch2, -416), xmm12);
+    __ vmovdqa(Address(rscratch2, -384), xmm11); __ vmovdqa(Address(rscratch2, -352), xmm10);
+    __ vmovdqa(Address(rscratch2, -320), xmm9);  __ vmovdqa(Address(rscratch2, -288), xmm8);
+    __ vmovdqa(Address(rscratch2, -256), xmm7);  __ vmovdqa(Address(rscratch2, -224), xmm6);
+    __ vmovdqa(Address(rscratch2, -192), xmm5);  __ vmovdqa(Address(rscratch2, -160), xmm4);
+    __ vmovdqa(Address(rscratch2, -128), xmm3);  __ vmovdqa(Address(rscratch2, -96),  xmm2);
+    __ vmovdqa(Address(rscratch2, -64),  xmm1);  __ vmovdqa(Address(rscratch2, -32),  xmm0);
+
+    __ addptr(qword_count, 64);
+    __ jcc(Assembler::lessEqual, L_bulk_loop_default);
+
+  __ BIND(L_tail_32);
+    __ vpxor(xmm15, xmm15);
+    __ vpxor(xmm14, xmm14);
+    __ vpxor(xmm13, xmm13);
+    __ vpxor(xmm12, xmm12);
+    __ vpxor(xmm11, xmm11);
+    __ vpxor(xmm10, xmm10);
+    __ vpxor(xmm9,  xmm9);
+    __ vpxor(xmm8,  xmm8);
+  __ BIND(L_tail_nozero_32);
+
+    // Copy trailing bulk qwords in progressively smaller blocks:
+    __ subptr(qword_count, 32);  // sub(64), add(32)
+    __ jcc(Assembler::greater, L_tail_16);
+    __ lea(rscratch1, Address(end_from, qword_count, Address::times_8));
+    __ lea(rscratch2, Address(end_to,   qword_count, Address::times_8));
+    __ vmovdqu(xmm7, Address(rscratch1, -256)); __ vmovdqu(xmm6, Address(rscratch1, -224));
+    __ vmovdqu(xmm5, Address(rscratch1, -192)); __ vmovdqu(xmm4, Address(rscratch1, -160));
+    __ vmovdqu(xmm3, Address(rscratch1, -128)); __ vmovdqu(xmm2, Address(rscratch1, -96));
+    __ vmovdqu(xmm1, Address(rscratch1, -64));  __ vmovdqu(xmm0, Address(rscratch1, -32));
+    __ vmovdqa(Address(rscratch2, -256), xmm7); __ vmovdqa(Address(rscratch2, -224), xmm6);
+    __ vmovdqa(Address(rscratch2, -192), xmm5); __ vmovdqa(Address(rscratch2, -160), xmm4);
+    __ vmovdqa(Address(rscratch2, -128), xmm3); __ vmovdqa(Address(rscratch2, -96),  xmm2);
+    __ vmovdqa(Address(rscratch2, -64),  xmm1); __ vmovdqa(Address(rscratch2, -32),  xmm0);
+    __ addptr(qword_count, 32);
+
+  __ BIND(L_tail_16);
+    __ vpxor(xmm7, xmm7);
+    __ vpxor(xmm6, xmm6);
+    __ vpxor(xmm5, xmm5);
+    __ vpxor(xmm4, xmm4);
+  __ BIND(L_tail_nozero_16);
+
+    __ subptr(qword_count, 16);  // sub(32), add(16)
+    __ jcc(Assembler::greater, L_tail_8);
+    __ lea(rscratch1, Address(end_from, qword_count, Address::times_8));
+    __ lea(rscratch2, Address(end_to,   qword_count, Address::times_8));
+    __ vmovdqu(xmm3, Address(rscratch1, -128)); __ vmovdqu(xmm2, Address(rscratch1, -96));
+    __ vmovdqu(xmm1, Address(rscratch1, -64));  __ vmovdqu(xmm0, Address(rscratch1, -32));
+    __ vmovdqa(Address(rscratch2, -128), xmm3); __ vmovdqa(Address(rscratch2, -96),  xmm2);
+    __ vmovdqa(Address(rscratch2, -64),  xmm1); __ vmovdqa(Address(rscratch2, -32),  xmm0);
+    __ addptr(qword_count, 16);
+
+  __ BIND(L_tail_8);
+    __ vpxor(xmm3, xmm3);
+    __ vpxor(xmm2, xmm2);
+  __ BIND(L_tail_nozero_8);
+
+    __ subptr(qword_count, 8);   // sub(16), add(8)
+    __ jcc(Assembler::greater, L_tail_4);
+    __ lea(rscratch1, Address(end_from, qword_count, Address::times_8));
+    __ lea(rscratch2, Address(end_to,   qword_count, Address::times_8));
+    __ vmovdqu(xmm1, Address(rscratch1, -64)); __ vmovdqu(xmm0, Address(rscratch1, -32));
+    __ vmovdqa(Address(rscratch2, -64), xmm1); __ vmovdqa(Address(rscratch2, -32), xmm0);
+    __ addptr(qword_count, 8);
+
+  __ BIND(L_tail_4);
+    __ vpxor(xmm1, xmm1);
+  __ BIND(L_tail_nozero_4);
+
+    __ subptr(qword_count, 4);   // sub(8), add(4)
+    __ jcc(Assembler::greater, L_tail_end);
+    __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -32));
+    __ vmovdqa(Address(end_to, qword_count, Address::times_8, -32), xmm0);
+    __ addptr(qword_count, 4);
+
+  __ BIND(L_tail_end);
+    __ vpxor(xmm0, xmm0);
+  __ BIND(L_tail_nozero_end);
+
+    __ subptr(qword_count, 4);
+    __ jcc(Assembler::zero, L_copy_7bytes_or_less);
 
     // Copy trailing qwords
-  __ BIND(L_copy_8_bytes);
-    __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
-    __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
+  __ BIND(L_copy_qwords);
+    __ movq(rax, Address(end_from, qword_count, Address::times_8));
+    __ movq(Address(end_to, qword_count, Address::times_8), rax);
     __ increment(qword_count);
-    __ jcc(Assembler::notZero, L_copy_8_bytes);
+    __ jccb(Assembler::notZero, L_copy_qwords);
 
     // Check for and copy trailing dword
-  __ BIND(L_copy_4_bytes);
-    __ testl(byte_count, 4);
-    __ jccb(Assembler::zero, L_copy_2_bytes);
-    __ movl(rax, Address(end_from, 8));
-    __ movl(Address(end_to, 8), rax);
-
+  __ BIND(L_copy_7bytes_or_less);
+    __ testptr(byte_count, 4);
+    __ jccb(Assembler::zero, L_copy_3bytes_or_less);
+    __ movl(rax, Address(end_from, 0));
+    __ movl(Address(end_to, 0), rax);
     __ addptr(end_from, 4);
     __ addptr(end_to, 4);
 
     // Check for and copy trailing word
-  __ BIND(L_copy_2_bytes);
-    __ testl(byte_count, 2);
-    __ jccb(Assembler::zero, L_copy_byte);
-    __ movw(rax, Address(end_from, 8));
-    __ movw(Address(end_to, 8), rax);
-
+  __ BIND(L_copy_3bytes_or_less);
+    __ testptr(byte_count, 2);
+    __ jccb(Assembler::zero, L_copy_1byte_or_less);
+    __ movw(rax, Address(end_from, 0));
+    __ movw(Address(end_to, 0), rax);
     __ addptr(end_from, 2);
     __ addptr(end_to, 2);
 
     // Check for and copy trailing byte
-  __ BIND(L_copy_byte);
-    __ testl(byte_count, 1);
+  __ BIND(L_copy_1byte_or_less);
+    __ testptr(byte_count, 1);
     __ jccb(Assembler::zero, L_exit);
-    __ movb(rax, Address(end_from, 8));
-    __ movb(Address(end_to, 8), rax);
+    __ movb(rax, Address(end_from, 0));
+    __ movb(Address(end_to, 0), rax);
 
   __ BIND(L_exit);
     restore_arg_regs();
@@ -1563,10 +1775,6 @@
 
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
-    // Copy in multi-bytes chunks
-    copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
-    __ jmp(L_copy_4_bytes);
-
     return start;
   }
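
Not part of the patch: for readers who prefer C++ to the assembler calls, below is a rough scalar sketch of the control flow the rewritten byte arraycopy stub encodes: a bypass for sub-qword copies, a qword loop for copies up to 64 bytes, and otherwise a pre-align slide to a 32-byte-aligned destination followed by bulk copying and the qword/dword/word/byte tails. The helper name and the memcpy-based bodies are invented stand-ins for illustration; the real stub uses unaligned ymm loads (vmovdqu) paired with 32-byte-aligned stores (vmovdqa) in blocks of up to 512 bytes.

#include <cstddef>
#include <cstdint>
#include <cstring>

// Illustrative sketch only -- not part of the patch. Scalar stand-in for the
// AVX2 stub above; all names here are invented for this example.
static void disjoint_byte_copy_sketch(const uint8_t* from, uint8_t* to, size_t byte_count) {
  if (byte_count >= 8) {
    if (byte_count > 64) {
      // Pre-align slide (L_prepare_bulk_align / L_adjust_*): consume just enough
      // bytes so that the destination becomes 32-byte aligned. The math mirrors
      // the stub's andptr(31); subptr(32); negptr(); andptr(31) sequence.
      size_t excess = (32 - (reinterpret_cast<uintptr_t>(to) & 31)) & 31;
      byte_count -= excess;
      while (excess--) {
        *to++ = *from++;
      }
      // Bulk copy (L_bulk_loop_default and the L_tail_* blocks): the stub moves
      // up to 512 bytes per iteration; a memcpy over whole 32-byte chunks stands
      // in for the vmovdqu loads and aligned vmovdqa stores.
      size_t bulk = byte_count & ~(size_t)31;
      memcpy(to, from, bulk);   // destination is 32-byte aligned here
      from += bulk;
      to += bulk;
      byte_count -= bulk;
    }
    // Trailing qwords (L_copy_qwords).
    while (byte_count >= 8) {
      uint64_t q;
      memcpy(&q, from, 8);
      memcpy(to, &q, 8);
      from += 8;
      to += 8;
      byte_count -= 8;
    }
  }
  // Sub-qword tail (L_copy_7bytes_or_less / L_copy_3bytes_or_less / L_copy_1byte_or_less).
  if (byte_count & 4) { memcpy(to, from, 4); from += 4; to += 4; }
  if (byte_count & 2) { memcpy(to, from, 2); from += 2; to += 2; }
  if (byte_count & 1) { *to = *from; }
}

The performance-sensitive details of the stub, entering the tail chain at the right block size to skip register zeroing and the single movdqu + vinserti128 pair in the main loop, have no scalar equivalent and are deliberately left out of this sketch.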