# HG changeset patch
# User shade
# Date 1515161098 -3600
#      Fri Jan 05 15:04:58 2018 +0100
# Node ID 76018e8c971e4f1b58bb0fa1eb9f5868af0ba2bb
# Parent  4d7a4fad8190670f836010a1a5d12772da3c365c
imported patch arraycopy-base

diff --git a/src/hotspot/cpu/x86/assembler_x86.cpp b/src/hotspot/cpu/x86/assembler_x86.cpp
--- a/src/hotspot/cpu/x86/assembler_x86.cpp
+++ b/src/hotspot/cpu/x86/assembler_x86.cpp
@@ -2156,6 +2156,14 @@
   emit_int8((unsigned char)0xF0);
 }
 
+// Emit sfence instruction
+void Assembler::sfence() {
+  NOT_LP64(assert(VM_Version::supports_sse2(), "unsupported");)
+  emit_int8(0x0F);
+  emit_int8((unsigned char)0xAE);
+  emit_int8((unsigned char)0xF8);
+}
+
 void Assembler::mov(Register dst, Register src) {
   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
 }
@@ -2507,6 +2515,30 @@
   emit_operand(src, dst);
 }
 
+void Assembler::vmovdqa(Address dst, XMMRegister src) {
+  assert(UseAVX > 0, "");
+  InstructionMark im(this);
+  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
+  // swap src<->dst for encoding
+  assert(src != xnoreg, "sanity");
+  vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x7F);
+  emit_operand(src, dst);
+}
+
+void Assembler::vmovntpd(Address dst, XMMRegister src) {
+  assert(UseAVX > 0, "");
+  InstructionMark im(this);
+  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
+  // swap src<->dst for encoding
+  assert(src != xnoreg, "sanity");
+  vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x2B);
+  emit_operand(src, dst);
+}
+
 // Move Unaligned EVEX enabled Vector (programmable : 8,16,32,64)
 void Assembler::evmovdqub(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
diff --git a/src/hotspot/cpu/x86/assembler_x86.hpp b/src/hotspot/cpu/x86/assembler_x86.hpp
--- a/src/hotspot/cpu/x86/assembler_x86.hpp
+++ b/src/hotspot/cpu/x86/assembler_x86.hpp
@@ -1343,6 +1343,7 @@
   }
 
   void mfence();
+  void sfence();
 
   // Moves
 
@@ -1402,6 +1403,9 @@
   void vmovdqu(XMMRegister dst, Address src);
   void vmovdqu(XMMRegister dst, XMMRegister src);
 
+  void vmovdqa(Address dst, XMMRegister src);
+  void vmovntpd(Address dst, XMMRegister src);
+
   // Move Unaligned 512bit Vector
   void evmovdqub(Address dst, XMMRegister src, int vector_len);
   void evmovdqub(XMMRegister dst, Address src, int vector_len);
diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
@@ -1487,8 +1487,7 @@
     StubCodeMark mark(this, "StubRoutines", name);
     address start = __ pc();
 
-    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
-    Label L_copy_byte, L_exit;
+    Label L_copy_qwords, L_copy_7bytes_or_less, L_copy_3bytes_or_less, L_copy_1byte_or_less, L_exit;
     const Register from        = rdi;  // source array address
     const Register to          = rsi;  // destination array address
     const Register count       = rdx;  // elements count
@@ -1512,48 +1511,261 @@
     // r9 and r10 may be used to save non-volatile registers
 
     // 'from', 'to' and 'count' are now valid
+
+    guarantee(UseAVX >= 2, "Experimental code");
+
+    Label L_prepare_bulk_align;
+    Label L_tail_32, L_tail_16, L_tail_8, L_tail_4, L_tail_end;
+    Label L_tail_nozero_32, L_tail_nozero_16, L_tail_nozero_8, L_tail_nozero_4, L_tail_nozero_end;
+
     __ movptr(byte_count, count);
-    __ shrptr(count, 3); // count => qword_count
-
-    // Copy from low to high addresses.  Use 'to' as scratch.
-    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
-    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
-    __ negptr(qword_count); // make the count negative
-    __ jmp(L_copy_bytes);
+
+    // If less than a qword, then don't bother doing any magic, fall through
+    // to the byte tail copy.
+    __ cmpptr(byte_count, 8);
+    __ jcc(Assembler::less, L_copy_7bytes_or_less);
+
+    // If greater than 64 bytes, then it makes sense to prepare and go to
+    // the aligned copy.
+    __ cmpptr(byte_count, 64);
+    __ jccb(Assembler::greater, L_prepare_bulk_align);
+
+    // Less than 64 bytes (8 qwords) => jump to the qword copy tail.
+    // This requires preparing the qword_count and src/dst addresses:
+    __ movptr(qword_count, byte_count);
+    __ shrptr(qword_count, 3);
+    __ lea(end_from, Address(from, qword_count, Address::times_8));
+    __ lea(end_to,   Address(to,   qword_count, Address::times_8));
+    __ negptr(qword_count);
+    __ jmp(L_copy_qwords);
+
+    // Pre-align slide: do enough individual copies to align the destination at 32 bytes.
+    // At this point we know there are enough elements to reach the proper alignment,
+    // so there is no need to check byte_count.
+  __ BIND(L_prepare_bulk_align);
+
+    Label L_adjust_2byte, L_adjust_4byte, L_adjust_8byte, L_adjust_16byte, L_adjust_done;
+
+    __ lea(rscratch2, Address(to, 0));
+    __ andptr(rscratch2, 31);
+    __ subptr(rscratch2, 32);
+    __ negptr(rscratch2);
+    __ andptr(rscratch2, 31);
+
+    // rscratch2 now holds the number of excess bytes; the pre-slide will consume
+    // them. Adjust the byte count here; from/to get adjusted during the pre-slide.
+    __ subptr(byte_count, rscratch2);
+
+    __ testptr(rscratch2, 1);
+    __ jccb(Assembler::zero, L_adjust_2byte);
+    __ movb(rax, Address(from, 0));
+    __ movb(Address(to, 0), rax);
+    __ addptr(from, 1);
+    __ addptr(to, 1);
+
+  __ BIND(L_adjust_2byte);
+    __ testptr(rscratch2, 2);
+    __ jccb(Assembler::zero, L_adjust_4byte);
+    __ movw(rax, Address(from, 0));
+    __ movw(Address(to, 0), rax);
+    __ addptr(from, 2);
+    __ addptr(to, 2);
+
+  __ BIND(L_adjust_4byte);
+    __ testptr(rscratch2, 4);
+    __ jccb(Assembler::zero, L_adjust_8byte);
+    __ movl(rax, Address(from, 0));
+    __ movl(Address(to, 0), rax);
+    __ addptr(from, 4);
+    __ addptr(to, 4);
+
+  __ BIND(L_adjust_8byte);
+    __ testptr(rscratch2, 8);
+    __ jccb(Assembler::zero, L_adjust_16byte);
+    __ movq(rax, Address(from, 0));
+    __ movq(Address(to, 0), rax);
+    __ addptr(from, 8);
+    __ addptr(to, 8);
+
+  __ BIND(L_adjust_16byte);
+    __ testptr(rscratch2, 16);
+    __ jccb(Assembler::zero, L_adjust_done);
+    __ movq(rax, Address(from, 0));
+    __ movq(Address(to, 0), rax);
+    __ movq(rax, Address(from, 8));
+    __ movq(Address(to, 8), rax);
+    __ addptr(from, 16);
+    __ addptr(to, 16);
+
+  __ BIND(L_adjust_done);
+
+    // Pre-slide done! At this point, the destination is guaranteed to be aligned
+    // to 32 bytes. This allows us to do the bulk copies with aligned stores.
+
+    // Prepare qword count and src/dst addresses
+    __ movptr(qword_count, byte_count);
+    __ shrptr(qword_count, 3);
+    __ lea(end_from, Address(from, qword_count, Address::times_8));
+    __ lea(end_to,   Address(to,   qword_count, Address::times_8));
+    __ negptr(qword_count);
+
+    // Medium-sized arrays benefit from skipping the larger bulk stores.
+    // Try to enter at the appropriate bulk tail; this avoids rushing
+    // through a size-checking maze, and avoids unnecessary zeroing of the
+    // xmm/ymm registers.
+    __ addptr(qword_count, 4);
+    __ jcc(Assembler::greater, L_tail_nozero_end);
+
+    __ addptr(qword_count, 4);   // sub(4), add(8)
+    __ jcc(Assembler::greater, L_tail_nozero_4);
+
+    __ addptr(qword_count, 8);   // sub(8), add(16)
+    __ jcc(Assembler::greater, L_tail_nozero_8);
+
+    __ addptr(qword_count, 16);  // sub(16), add(32)
+    __ jcc(Assembler::greater, L_tail_nozero_16);
+
+    __ addptr(qword_count, 32);  // sub(32), add(64)
+    __ jcc(Assembler::greater, L_tail_nozero_32);
+
+    // Massively parallel copy: moves lots of data on each iteration (default)
+    Label L_bulk_loop_default;
+    __ align(OptoLoopAlignment);
+  __ BIND(L_bulk_loop_default);
+    __ lea(rscratch1, Address(end_from, qword_count, Address::times_8));
+
+    // Remarkably, doing a single pair of 16-byte accesses helps performance:
+    // RESOURCE_STALLS falls abruptly. Extending this trick to all other loads
+    // degrades performance. :/
+    __ movdqu(xmm15, Address(rscratch1, -512));
+    __ vinserti128(xmm15, xmm15, Address(rscratch1, -496), 1);
+
+    __ vmovdqu(xmm14, Address(rscratch1, -480));
+    __ vmovdqu(xmm13, Address(rscratch1, -448)); __ vmovdqu(xmm12, Address(rscratch1, -416));
+    __ vmovdqu(xmm11, Address(rscratch1, -384)); __ vmovdqu(xmm10, Address(rscratch1, -352));
+    __ vmovdqu(xmm9,  Address(rscratch1, -320)); __ vmovdqu(xmm8,  Address(rscratch1, -288));
+    __ vmovdqu(xmm7,  Address(rscratch1, -256)); __ vmovdqu(xmm6,  Address(rscratch1, -224));
+    __ vmovdqu(xmm5,  Address(rscratch1, -192)); __ vmovdqu(xmm4,  Address(rscratch1, -160));
+    __ vmovdqu(xmm3,  Address(rscratch1, -128)); __ vmovdqu(xmm2,  Address(rscratch1, -96));
+    __ vmovdqu(xmm1,  Address(rscratch1, -64));  __ vmovdqu(xmm0,  Address(rscratch1, -32));
+
+    __ lea(rscratch2, Address(end_to, qword_count, Address::times_8));
+    __ vmovdqa(Address(rscratch2, -512), xmm15); __ vmovdqa(Address(rscratch2, -480), xmm14);
+    __ vmovdqa(Address(rscratch2, -448), xmm13); __ vmovdqa(Address(rscratch2, -416), xmm12);
+    __ vmovdqa(Address(rscratch2, -384), xmm11); __ vmovdqa(Address(rscratch2, -352), xmm10);
+    __ vmovdqa(Address(rscratch2, -320), xmm9);  __ vmovdqa(Address(rscratch2, -288), xmm8);
+    __ vmovdqa(Address(rscratch2, -256), xmm7);  __ vmovdqa(Address(rscratch2, -224), xmm6);
+    __ vmovdqa(Address(rscratch2, -192), xmm5);  __ vmovdqa(Address(rscratch2, -160), xmm4);
+    __ vmovdqa(Address(rscratch2, -128), xmm3);  __ vmovdqa(Address(rscratch2, -96),  xmm2);
+    __ vmovdqa(Address(rscratch2, -64),  xmm1);  __ vmovdqa(Address(rscratch2, -32),  xmm0);
+
+    __ addptr(qword_count, 64);
+    __ jcc(Assembler::lessEqual, L_bulk_loop_default);
+
+  __ BIND(L_tail_32);
+    __ vpxor(xmm15, xmm15);
+    __ vpxor(xmm14, xmm14);
+    __ vpxor(xmm13, xmm13);
+    __ vpxor(xmm12, xmm12);
+    __ vpxor(xmm11, xmm11);
+    __ vpxor(xmm10, xmm10);
+    __ vpxor(xmm9,  xmm9);
+    __ vpxor(xmm8,  xmm8);
+  __ BIND(L_tail_nozero_32);
+
+    // Copy trailing bulk qwords in progressively smaller blocks:
+    __ subptr(qword_count, 32);  // sub(64), add(32)
+    __ jcc(Assembler::greater, L_tail_16);
+    __ lea(rscratch1, Address(end_from, qword_count, Address::times_8));
+    __ lea(rscratch2, Address(end_to,   qword_count, Address::times_8));
+    __ vmovdqu(xmm7, Address(rscratch1, -256)); __ vmovdqu(xmm6, Address(rscratch1, -224));
+    __ vmovdqu(xmm5, Address(rscratch1, -192)); __ vmovdqu(xmm4, Address(rscratch1, -160));
+    __ vmovdqu(xmm3, Address(rscratch1, -128)); __ vmovdqu(xmm2, Address(rscratch1, -96));
+    __ vmovdqu(xmm1, Address(rscratch1, -64));  __ vmovdqu(xmm0, Address(rscratch1, -32));
+    __ vmovdqa(Address(rscratch2, -256), xmm7); __ vmovdqa(Address(rscratch2, -224), xmm6);
+    __ vmovdqa(Address(rscratch2, -192), xmm5); __ vmovdqa(Address(rscratch2, -160), xmm4);
+    __ vmovdqa(Address(rscratch2, -128), xmm3); __ vmovdqa(Address(rscratch2, -96),  xmm2);
+    __ vmovdqa(Address(rscratch2, -64),  xmm1); __ vmovdqa(Address(rscratch2, -32),  xmm0);
+    __ addptr(qword_count, 32);
+
+  __ BIND(L_tail_16);
+    __ vpxor(xmm7, xmm7);
+    __ vpxor(xmm6, xmm6);
+    __ vpxor(xmm5, xmm5);
+    __ vpxor(xmm4, xmm4);
+  __ BIND(L_tail_nozero_16);
+
+    __ subptr(qword_count, 16);  // sub(32), add(16)
+    __ jcc(Assembler::greater, L_tail_8);
+    __ lea(rscratch1, Address(end_from, qword_count, Address::times_8));
+    __ lea(rscratch2, Address(end_to,   qword_count, Address::times_8));
+    __ vmovdqu(xmm3, Address(rscratch1, -128)); __ vmovdqu(xmm2, Address(rscratch1, -96));
+    __ vmovdqu(xmm1, Address(rscratch1, -64));  __ vmovdqu(xmm0, Address(rscratch1, -32));
+    __ vmovdqa(Address(rscratch2, -128), xmm3); __ vmovdqa(Address(rscratch2, -96),  xmm2);
+    __ vmovdqa(Address(rscratch2, -64),  xmm1); __ vmovdqa(Address(rscratch2, -32),  xmm0);
+    __ addptr(qword_count, 16);
+
+  __ BIND(L_tail_8);
+    __ vpxor(xmm3, xmm3);
+    __ vpxor(xmm2, xmm2);
+  __ BIND(L_tail_nozero_8);
+
+    __ subptr(qword_count, 8);   // sub(16), add(8)
+    __ jcc(Assembler::greater, L_tail_4);
+    __ lea(rscratch1, Address(end_from, qword_count, Address::times_8));
+    __ lea(rscratch2, Address(end_to,   qword_count, Address::times_8));
+    __ vmovdqu(xmm1, Address(rscratch1, -64)); __ vmovdqu(xmm0, Address(rscratch1, -32));
+    __ vmovdqa(Address(rscratch2, -64), xmm1); __ vmovdqa(Address(rscratch2, -32), xmm0);
+    __ addptr(qword_count, 8);
+
+  __ BIND(L_tail_4);
+    __ vpxor(xmm1, xmm1);
+  __ BIND(L_tail_nozero_4);
+
+    __ subptr(qword_count, 4);   // sub(8), add(4)
+    __ jcc(Assembler::greater, L_tail_end);
+    __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -32));
+    __ vmovdqa(Address(end_to, qword_count, Address::times_8, -32), xmm0);
+    __ addptr(qword_count, 4);
+
+  __ BIND(L_tail_end);
+    __ vpxor(xmm0, xmm0);
+  __ BIND(L_tail_nozero_end);
+
+    __ subptr(qword_count, 4);
+    __ jcc(Assembler::zero, L_copy_7bytes_or_less);
 
     // Copy trailing qwords
-  __ BIND(L_copy_8_bytes);
-    __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
-    __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
+  __ BIND(L_copy_qwords);
+    __ movq(rax, Address(end_from, qword_count, Address::times_8));
+    __ movq(Address(end_to, qword_count, Address::times_8), rax);
     __ increment(qword_count);
-    __ jcc(Assembler::notZero, L_copy_8_bytes);
+    __ jccb(Assembler::notZero, L_copy_qwords);
 
     // Check for and copy trailing dword
-  __ BIND(L_copy_4_bytes);
-    __ testl(byte_count, 4);
-    __ jccb(Assembler::zero, L_copy_2_bytes);
-    __ movl(rax, Address(end_from, 8));
-    __ movl(Address(end_to, 8), rax);
-
+  __ BIND(L_copy_7bytes_or_less);
+    __ testptr(byte_count, 4);
+    __ jccb(Assembler::zero, L_copy_3bytes_or_less);
+    __ movl(rax, Address(end_from, 0));
+    __ movl(Address(end_to, 0), rax);
     __ addptr(end_from, 4);
     __ addptr(end_to, 4);
 
     // Check for and copy trailing word
-  __ BIND(L_copy_2_bytes);
-    __ testl(byte_count, 2);
-    __ jccb(Assembler::zero, L_copy_byte);
-    __ movw(rax, Address(end_from, 8));
-    __ movw(Address(end_to, 8), rax);
-
+  __ BIND(L_copy_3bytes_or_less);
+    __ testptr(byte_count, 2);
+    __ jccb(Assembler::zero, L_copy_1byte_or_less);
+    __ movw(rax, Address(end_from, 0));
+    __ movw(Address(end_to, 0), rax);
     __ addptr(end_from, 2);
     __ addptr(end_to, 2);
 
     // Check for and copy trailing byte
-  __ BIND(L_copy_byte);
-    __ testl(byte_count, 1);
+  __ BIND(L_copy_1byte_or_less);
+    __ testptr(byte_count, 1);
     __ jccb(Assembler::zero, L_exit);
-    __ movb(rax, Address(end_from, 8));
-    __ movb(Address(end_to, 8), rax);
+    __ movb(rax, Address(end_from, 0));
+    __ movb(Address(end_to, 0), rax);
 
   __ BIND(L_exit);
     restore_arg_regs();
@@ -1563,10 +1775,6 @@
 
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
-    // Copy in multi-bytes chunks
-    copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
-    __ jmp(L_copy_4_bytes);
-
     return start;
   }
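
Not part of the patch: for readers who prefer C++ to the assembler calls, below is a rough scalar sketch of the control flow the rewritten byte arraycopy stub encodes: a bypass for sub-qword copies, a qword loop for copies up to 64 bytes, and otherwise a pre-align slide to a 32-byte-aligned destination followed by bulk copying and the qword/dword/word/byte tails. The helper name and the memcpy-based bodies are invented stand-ins for illustration; the real stub uses unaligned ymm loads (vmovdqu) paired with 32-byte-aligned stores (vmovdqa) in blocks of up to 512 bytes.

#include <cstddef>
#include <cstdint>
#include <cstring>

// Illustrative sketch only -- not part of the patch. Scalar stand-in for the
// AVX2 stub above; all names here are invented for this example.
static void disjoint_byte_copy_sketch(const uint8_t* from, uint8_t* to, size_t byte_count) {
  if (byte_count >= 8) {
    if (byte_count > 64) {
      // Pre-align slide (L_prepare_bulk_align / L_adjust_*): consume just enough
      // bytes so that the destination becomes 32-byte aligned. The math mirrors
      // the stub's andptr(31); subptr(32); negptr(); andptr(31) sequence.
      size_t excess = (32 - (reinterpret_cast<uintptr_t>(to) & 31)) & 31;
      byte_count -= excess;
      while (excess--) {
        *to++ = *from++;
      }
      // Bulk copy (L_bulk_loop_default and the L_tail_* blocks): the stub moves
      // up to 512 bytes per iteration; a memcpy over whole 32-byte chunks stands
      // in for the vmovdqu loads and aligned vmovdqa stores.
      size_t bulk = byte_count & ~(size_t)31;
      memcpy(to, from, bulk);   // destination is 32-byte aligned here
      from += bulk;
      to += bulk;
      byte_count -= bulk;
    }
    // Trailing qwords (L_copy_qwords).
    while (byte_count >= 8) {
      uint64_t q;
      memcpy(&q, from, 8);
      memcpy(to, &q, 8);
      from += 8;
      to += 8;
      byte_count -= 8;
    }
  }
  // Sub-qword tail (L_copy_7bytes_or_less / L_copy_3bytes_or_less / L_copy_1byte_or_less).
  if (byte_count & 4) { memcpy(to, from, 4); from += 4; to += 4; }
  if (byte_count & 2) { memcpy(to, from, 2); from += 2; to += 2; }
  if (byte_count & 1) { *to = *from; }
}

The performance-sensitive details of the stub, entering the tail chain at the right block size to skip register zeroing and the single movdqu + vinserti128 pair in the main loop, have no scalar equivalent and are deliberately left out of this sketch.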