
src/hotspot/cpu/x86/stubGenerator_x86_64.cpp

rev 48409 : imported patch arraycopy-base

*** 1485,1496 ****
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
  
!   Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
!   Label L_copy_byte, L_exit;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register count       = rdx;  // elements count
    const Register byte_count  = rcx;
    const Register qword_count = count;
--- 1485,1495 ----
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
  
!   Label L_copy_qwords, L_copy_7bytes_or_less, L_copy_3bytes_or_less, L_copy_1byte_or_less, L_exit;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register count       = rdx;  // elements count
    const Register byte_count  = rcx;
    const Register qword_count = count;
*** 1510,1574 ****
    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                      // r9 and r10 may be used to save non-volatile registers
  
    // 'from', 'to' and 'count' are now valid
    __ movptr(byte_count, count);
-   __ shrptr(count, 3); // count => qword_count
  
!   // Copy from low to high addresses.  Use 'to' as scratch.
!   __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
!   __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
!   __ negptr(qword_count); // make the count negative
!   __ jmp(L_copy_bytes);
  
    // Copy trailing qwords
!   __ BIND(L_copy_8_bytes);
!   __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
!   __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
    __ increment(qword_count);
!   __ jcc(Assembler::notZero, L_copy_8_bytes);
  
    // Check for and copy trailing dword
!   __ BIND(L_copy_4_bytes);
!   __ testl(byte_count, 4);
!   __ jccb(Assembler::zero, L_copy_2_bytes);
!   __ movl(rax, Address(end_from, 8));
!   __ movl(Address(end_to, 8), rax);
!   __ addptr(end_from, 4);
    __ addptr(end_to, 4);
  
    // Check for and copy trailing word
!   __ BIND(L_copy_2_bytes);
!   __ testl(byte_count, 2);
!   __ jccb(Assembler::zero, L_copy_byte);
!   __ movw(rax, Address(end_from, 8));
!   __ movw(Address(end_to, 8), rax);
!   __ addptr(end_from, 2);
    __ addptr(end_to, 2);
  
    // Check for and copy trailing byte
!   __ BIND(L_copy_byte);
!   __ testl(byte_count, 1);
    __ jccb(Assembler::zero, L_exit);
!   __ movb(rax, Address(end_from, 8));
!   __ movb(Address(end_to, 8), rax);
  
    __ BIND(L_exit);
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
  
-   // Copy in multi-bytes chunks
-   copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
-   __ jmp(L_copy_4_bytes);
- 
    return start;
  }
  
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
--- 1509,1782 ----
    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                      // r9 and r10 may be used to save non-volatile registers
  
    // 'from', 'to' and 'count' are now valid
+ 
+   guarantee(UseAVX >= 2, "Experimental code");
+ 
+   Label L_prepare_bulk_align;
+   Label L_tail_32, L_tail_16, L_tail_8, L_tail_4, L_tail_end;
+   Label L_tail_nozero_32, L_tail_nozero_16, L_tail_nozero_8, L_tail_nozero_4, L_tail_nozero_end;
+ 
    __ movptr(byte_count, count);
  
!   // If less than a qword, then don't bother doing any magic, follow to
!   // byte tail copy.
!   __ cmpptr(byte_count, 8);
!   __ jcc(Assembler::less, L_copy_7bytes_or_less);
! 
!   // If greater than 64 bytes, then it makes sense to prepare and go to
!   // the aligned copy.
!   __ cmpptr(byte_count, 64);
!   __ jccb(Assembler::greater, L_prepare_bulk_align);
! 
!   // Less than 64 bytes (8 qwords) => jump to qword copy tail.
!   // This requires preparing the qword_count and src/dst addresses:
!   __ movptr(qword_count, byte_count);
!   __ shrptr(qword_count, 3);
!   __ lea(end_from, Address(from, qword_count, Address::times_8));
!   __ lea(end_to,   Address(to,   qword_count, Address::times_8));
!   __ negptr(qword_count);
!   __ jmp(L_copy_qwords);
! 
!   // Pre-align slide: do enough individual copies to align the destination
!   // at 32 bytes. At this point we know there are enough elements to hit
!   // the proper alignment, so we don't need to check byte_count.
!   __ BIND(L_prepare_bulk_align);
! 
!   Label L_adjust_2byte, L_adjust_4byte, L_adjust_8byte, L_adjust_16byte, L_adjust_done;
! 
!   __ lea(rscratch2, Address(to, 0));
!   __ andptr(rscratch2, 31);
!   __ subptr(rscratch2, 32);
!   __ negptr(rscratch2);
!   __ andptr(rscratch2, 31);
! 
!   // rscratch2 holds the number of excess bytes found; the pre-slide will
!   // consume them. Adjust the byte count here; from/to get adjusted during
!   // the pre-slide.
!   __ subptr(byte_count, rscratch2);
! 
!   __ testptr(rscratch2, 1);
!   __ jccb(Assembler::zero, L_adjust_2byte);
!   __ movb(rax, Address(from, 0));
!   __ movb(Address(to, 0), rax);
!   __ addptr(from, 1);
!   __ addptr(to, 1);
! 
!   __ BIND(L_adjust_2byte);
!   __ testptr(rscratch2, 2);
!   __ jccb(Assembler::zero, L_adjust_4byte);
!   __ movw(rax, Address(from, 0));
!   __ movw(Address(to, 0), rax);
!   __ addptr(from, 2);
!   __ addptr(to, 2);
! 
!   __ BIND(L_adjust_4byte);
!   __ testptr(rscratch2, 4);
!   __ jccb(Assembler::zero, L_adjust_8byte);
!   __ movl(rax, Address(from, 0));
!   __ movl(Address(to, 0), rax);
!   __ addptr(from, 4);
!   __ addptr(to, 4);
! 
!   __ BIND(L_adjust_8byte);
!   __ testptr(rscratch2, 8);
!   __ jccb(Assembler::zero, L_adjust_16byte);
!   __ movq(rax, Address(from, 0));
!   __ movq(Address(to, 0), rax);
!   __ addptr(from, 8);
!   __ addptr(to, 8);
! 
!   __ BIND(L_adjust_16byte);
!   __ testptr(rscratch2, 16);
!   __ jccb(Assembler::zero, L_adjust_done);
!   __ movq(rax, Address(from, 0));
!   __ movq(Address(to, 0), rax);
!   __ movq(rax, Address(from, 8));
!   __ movq(Address(to, 8), rax);
!   __ addptr(from, 16);
!   __ addptr(to, 16);
! 
!   __ BIND(L_adjust_done);
! 
!   // Pre-slide done! At this point, the destination is guaranteed to be
!   // aligned to 32 bytes. This allows us to do the bulk copies with aligned
!   // stores.
! 
!   // Prepare qword count and src/dst addresses
!   __ movptr(qword_count, byte_count);
!   __ shrptr(qword_count, 3);
!   __ lea(end_from, Address(from, qword_count, Address::times_8));
!   __ lea(end_to,   Address(to,   qword_count, Address::times_8));
!   __ negptr(qword_count);
! 
!   // Medium-sized arrays benefit from skipping the larger bulk stores.
!   // Try to enter at the appropriate bulk tail; this avoids rushing
!   // through a size-checking maze, and avoids unnecessary zeroing of the
!   // xmm/ymm registers.
!   __ addptr(qword_count, 4);
!   __ jcc(Assembler::greater, L_tail_nozero_end);
! 
!   __ addptr(qword_count, 4);  // sub(4), add(8)
!   __ jcc(Assembler::greater, L_tail_nozero_4);
! 
!   __ addptr(qword_count, 8);  // sub(8), add(16)
!   __ jcc(Assembler::greater, L_tail_nozero_8);
! 
!   __ addptr(qword_count, 16); // sub(16), add(32)
!   __ jcc(Assembler::greater, L_tail_nozero_16);
! 
!   __ addptr(qword_count, 32); // sub(32), add(64)
!   __ jcc(Assembler::greater, L_tail_nozero_32);
! 
!   // Massively parallel copy: moves lots of data on each iteration (default)
!   Label L_bulk_loop_default;
!   __ align(OptoLoopAlignment);
!   __ BIND(L_bulk_loop_default);
!   __ lea(rscratch1, Address(end_from, qword_count, Address::times_8));
! 
!   // Remarkably, doing a single pair of 16-byte accesses helps performance:
!   // RESOURCE_STALLS falls abruptly. Extending this trick to all other loads
!   // degrades performance. :/
!   __ movdqu(xmm15, Address(rscratch1, -512));
!   __ vinserti128(xmm15, xmm15, Address(rscratch1, -496), 1);
! 
!   __ vmovdqu(xmm14, Address(rscratch1, -480));
!   __ vmovdqu(xmm13, Address(rscratch1, -448)); __ vmovdqu(xmm12, Address(rscratch1, -416));
!   __ vmovdqu(xmm11, Address(rscratch1, -384)); __ vmovdqu(xmm10, Address(rscratch1, -352));
!   __ vmovdqu(xmm9,  Address(rscratch1, -320)); __ vmovdqu(xmm8,  Address(rscratch1, -288));
!   __ vmovdqu(xmm7,  Address(rscratch1, -256)); __ vmovdqu(xmm6,  Address(rscratch1, -224));
!   __ vmovdqu(xmm5,  Address(rscratch1, -192)); __ vmovdqu(xmm4,  Address(rscratch1, -160));
!   __ vmovdqu(xmm3,  Address(rscratch1, -128)); __ vmovdqu(xmm2,  Address(rscratch1, -96));
!   __ vmovdqu(xmm1,  Address(rscratch1, -64));  __ vmovdqu(xmm0,  Address(rscratch1, -32));
! 
!   __ lea(rscratch2, Address(end_to, qword_count, Address::times_8));
!   __ vmovdqa(Address(rscratch2, -512), xmm15); __ vmovdqa(Address(rscratch2, -480), xmm14);
!   __ vmovdqa(Address(rscratch2, -448), xmm13); __ vmovdqa(Address(rscratch2, -416), xmm12);
!   __ vmovdqa(Address(rscratch2, -384), xmm11); __ vmovdqa(Address(rscratch2, -352), xmm10);
!   __ vmovdqa(Address(rscratch2, -320), xmm9);  __ vmovdqa(Address(rscratch2, -288), xmm8);
!   __ vmovdqa(Address(rscratch2, -256), xmm7);  __ vmovdqa(Address(rscratch2, -224), xmm6);
!   __ vmovdqa(Address(rscratch2, -192), xmm5);  __ vmovdqa(Address(rscratch2, -160), xmm4);
!   __ vmovdqa(Address(rscratch2, -128), xmm3);  __ vmovdqa(Address(rscratch2, -96), xmm2);
!   __ vmovdqa(Address(rscratch2, -64), xmm1);   __ vmovdqa(Address(rscratch2, -32), xmm0);
! 
!   __ addptr(qword_count, 64);
!   __ jcc(Assembler::lessEqual, L_bulk_loop_default);
! 
!   __ BIND(L_tail_32);
!   __ vpxor(xmm15, xmm15);
!   __ vpxor(xmm14, xmm14);
!   __ vpxor(xmm13, xmm13);
!   __ vpxor(xmm12, xmm12);
!   __ vpxor(xmm11, xmm11);
!   __ vpxor(xmm10, xmm10);
!   __ vpxor(xmm9, xmm9);
!   __ vpxor(xmm8, xmm8);
!   __ BIND(L_tail_nozero_32);
! 
!   // Copy trailing bulk qwords in progressively smaller chunks:
!   __ subptr(qword_count, 32); // sub(64), add(32)
!   __ jcc(Assembler::greater, L_tail_16);
!   __ lea(rscratch1, Address(end_from, qword_count, Address::times_8));
!   __ lea(rscratch2, Address(end_to,   qword_count, Address::times_8));
!   __ vmovdqu(xmm7, Address(rscratch1, -256));  __ vmovdqu(xmm6, Address(rscratch1, -224));
!   __ vmovdqu(xmm5, Address(rscratch1, -192));  __ vmovdqu(xmm4, Address(rscratch1, -160));
!   __ vmovdqu(xmm3, Address(rscratch1, -128));  __ vmovdqu(xmm2, Address(rscratch1, -96));
!   __ vmovdqu(xmm1, Address(rscratch1, -64));   __ vmovdqu(xmm0, Address(rscratch1, -32));
!   __ vmovdqa(Address(rscratch2, -256), xmm7);  __ vmovdqa(Address(rscratch2, -224), xmm6);
!   __ vmovdqa(Address(rscratch2, -192), xmm5);  __ vmovdqa(Address(rscratch2, -160), xmm4);
!   __ vmovdqa(Address(rscratch2, -128), xmm3);  __ vmovdqa(Address(rscratch2, -96), xmm2);
!   __ vmovdqa(Address(rscratch2, -64), xmm1);   __ vmovdqa(Address(rscratch2, -32), xmm0);
!   __ addptr(qword_count, 32);
! 
!   __ BIND(L_tail_16);
!   __ vpxor(xmm7, xmm7);
!   __ vpxor(xmm6, xmm6);
!   __ vpxor(xmm5, xmm5);
!   __ vpxor(xmm4, xmm4);
!   __ BIND(L_tail_nozero_16);
! 
!   __ subptr(qword_count, 16); // sub(32), add(16)
!   __ jcc(Assembler::greater, L_tail_8);
!   __ lea(rscratch1, Address(end_from, qword_count, Address::times_8));
!   __ lea(rscratch2, Address(end_to,   qword_count, Address::times_8));
!   __ vmovdqu(xmm3, Address(rscratch1, -128));  __ vmovdqu(xmm2, Address(rscratch1, -96));
!   __ vmovdqu(xmm1, Address(rscratch1, -64));   __ vmovdqu(xmm0, Address(rscratch1, -32));
!   __ vmovdqa(Address(rscratch2, -128), xmm3);  __ vmovdqa(Address(rscratch2, -96), xmm2);
!   __ vmovdqa(Address(rscratch2, -64), xmm1);   __ vmovdqa(Address(rscratch2, -32), xmm0);
!   __ addptr(qword_count, 16);
! 
!   __ BIND(L_tail_8);
!   __ vpxor(xmm3, xmm3);
!   __ vpxor(xmm2, xmm2);
!   __ BIND(L_tail_nozero_8);
! 
!   __ subptr(qword_count, 8); // sub(16), add(8)
!   __ jcc(Assembler::greater, L_tail_4);
!   __ lea(rscratch1, Address(end_from, qword_count, Address::times_8));
!   __ lea(rscratch2, Address(end_to,   qword_count, Address::times_8));
!   __ vmovdqu(xmm1, Address(rscratch1, -64));   __ vmovdqu(xmm0, Address(rscratch1, -32));
!   __ vmovdqa(Address(rscratch2, -64), xmm1);   __ vmovdqa(Address(rscratch2, -32), xmm0);
!   __ addptr(qword_count, 8);
! 
!   __ BIND(L_tail_4);
!   __ vpxor(xmm1, xmm1);
!   __ BIND(L_tail_nozero_4);
! 
!   __ subptr(qword_count, 4); // sub(8), add(4)
!   __ jcc(Assembler::greater, L_tail_end);
!   __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -32));
!   __ vmovdqa(Address(end_to, qword_count, Address::times_8, -32), xmm0);
!   __ addptr(qword_count, 4);
! 
!   __ BIND(L_tail_end);
!   __ vpxor(xmm0, xmm0);
!   __ BIND(L_tail_nozero_end);
! 
!   __ subptr(qword_count, 4);
!   __ jcc(Assembler::zero, L_copy_7bytes_or_less);
  
    // Copy trailing qwords
!   __ BIND(L_copy_qwords);
!   __ movq(rax, Address(end_from, qword_count, Address::times_8));
!   __ movq(Address(end_to, qword_count, Address::times_8), rax);
    __ increment(qword_count);
!   __ jccb(Assembler::notZero, L_copy_qwords);
  
    // Check for and copy trailing dword
!   __ BIND(L_copy_7bytes_or_less);
!   __ testptr(byte_count, 4);
!   __ jccb(Assembler::zero, L_copy_3bytes_or_less);
!   __ movl(rax, Address(end_from, 0));
!   __ movl(Address(end_to, 0), rax);
    __ addptr(end_from, 4);
    __ addptr(end_to, 4);
  
    // Check for and copy trailing word
!   __ BIND(L_copy_3bytes_or_less);
!   __ testptr(byte_count, 2);
!   __ jccb(Assembler::zero, L_copy_1byte_or_less);
!   __ movw(rax, Address(end_from, 0));
!   __ movw(Address(end_to, 0), rax);
    __ addptr(end_from, 2);
    __ addptr(end_to, 2);
  
    // Check for and copy trailing byte
!   __ BIND(L_copy_1byte_or_less);
!   __ testptr(byte_count, 1);
    __ jccb(Assembler::zero, L_exit);
!   __ movb(rax, Address(end_from, 0));
!   __ movb(Address(end_to, 0), rax);
  
    __ BIND(L_exit);
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
  
    return start;
  }
  
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
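
Editor's note on the overall shape of the new stub: the generated code dispatches on size. Below is a plain-C++ sketch of that control flow, offered as a model only: every name is invented for illustration, the thresholds (8 and 64 bytes, 32-byte destination alignment) are taken from the patch, and the AVX2 bulk stages are elided, with the scalar qword loop standing in for them functionally.

#include <cstddef>
#include <cstdint>
#include <cstring>

namespace sketch {

// Mirrors the L_adjust_* pre-slide chain: copies n < 32 bytes in
// 1/2/4/8/16-byte pieces, advancing both pointers.
inline void copy_adjust(const uint8_t*& from, uint8_t*& to, std::size_t n) {
  if (n & 1)  { *to = *from;               from += 1;  to += 1;  }
  if (n & 2)  { std::memcpy(to, from, 2);  from += 2;  to += 2;  }
  if (n & 4)  { std::memcpy(to, from, 4);  from += 4;  to += 4;  }
  if (n & 8)  { std::memcpy(to, from, 8);  from += 8;  to += 8;  }
  if (n & 16) { std::memcpy(to, from, 16); from += 16; to += 16; }
}

void disjoint_byte_copy(const uint8_t* from, uint8_t* to, std::size_t byte_count) {
  if (byte_count >= 8) {
    if (byte_count > 64) {
      // Pre-align slide: make the destination 32-byte aligned so the bulk
      // loop can use aligned (vmovdqa) stores.
      std::size_t pre = (32 - (reinterpret_cast<uintptr_t>(to) & 31)) & 31;
      copy_adjust(from, to, pre);
      byte_count -= pre;
      // ... the 512/256/128/64/32-byte AVX2 stages would run here; the
      // qword loop below also covers them in this scalar model ...
    }
    // L_copy_qwords: whole 8-byte words.
    for (std::size_t q = byte_count >> 3; q != 0; q--) {
      std::memcpy(to, from, 8);
      from += 8; to += 8;
    }
  }
  // L_copy_7bytes_or_less .. L_copy_1byte_or_less: the 4/2/1-byte tail,
  // driven by the low bits of the original byte count.
  if (byte_count & 4) { std::memcpy(to, from, 4); from += 4; to += 4; }
  if (byte_count & 2) { std::memcpy(to, from, 2); from += 2; to += 2; }
  if (byte_count & 1) { *to = *from; }
}

} // namespace sketch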
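
The four-instruction sequence at the top of the pre-align slide (andptr, subptr, negptr, andptr on rscratch2) computes the number of bytes needed to reach the next 32-byte boundary, with the final mask turning 32 into 0 for an already-aligned destination. A small self-checking sketch of the arithmetic (function name hypothetical):

#include <cassert>
#include <cstdint>

// r = (32 - (to & 31)) & 31, spelled the way the stub emits it.
static inline uintptr_t pre_slide_bytes(uintptr_t to) {
  uintptr_t r = to & 31;             // andptr(rscratch2, 31)
  r = r - 32;                        // subptr(rscratch2, 32)
  r = (uintptr_t)(-(intptr_t)r);     // negptr(rscratch2)
  return r & 31;                     // andptr(rscratch2, 31): 32 -> 0
}

int main() {
  assert(pre_slide_bytes(0x1000) == 0);   // already 32-byte aligned
  assert(pre_slide_bytes(0x1007) == 25);  // 25 = 16 + 8 + 1 => three L_adjust_* pieces
  assert(pre_slide_bytes(0x101f) == 1);   // one byte short of the boundary
  return 0;
}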
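
The addptr/jcc ladder in front of the bulk loop is effectively a computed entry into the tail stages: qword_count is negative at that point, so adding back a power of two and testing the sign selects the largest stage that still fits, with no chain of explicit compares. A scalar model of the selection (hypothetical function; stage widths and label mapping read off the patch):

#include <cstddef>

// neg_qwords == -N on entry; returns the qword width of the stage the
// generated code would enter at.
static int bulk_entry_stage(std::ptrdiff_t neg_qwords) {
  std::ptrdiff_t q = neg_qwords;
  if ((q += 4)  > 0) return 0;   // N < 4:  qword tail only (L_tail_nozero_end)
  if ((q += 4)  > 0) return 4;   // N < 8:  32-byte stage   (L_tail_nozero_4)
  if ((q += 8)  > 0) return 8;   // N < 16: 64-byte stage   (L_tail_nozero_8)
  if ((q += 16) > 0) return 16;  // N < 32: 128-byte stage  (L_tail_nozero_16)
  if ((q += 32) > 0) return 32;  // N < 64: 256-byte stage  (L_tail_nozero_32)
  return 64;                     // N >= 64: 512-byte main loop
}

Entering mid-ladder at an L_tail_nozero_* label also skips the vpxor scrubbing of the wider registers, which only the fall-through path from a larger stage performs.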
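
Finally, the qword tail keeps the pre-existing negative-index idiom: both end addresses are formed with lea, the count is negated, and the loop counts up toward zero so that the increment itself produces the flags the branch consumes, saving a separate compare. A scalar equivalent, with hypothetical names:

#include <cstddef>
#include <cstdint>
#include <cstring>

static void copy_qwords(const uint8_t* from, uint8_t* to, std::size_t qword_count) {
  const uint8_t* end_from = from + qword_count * 8;        // lea(end_from, ...)
  uint8_t*       end_to   = to   + qword_count * 8;        // lea(end_to, ...)
  std::ptrdiff_t i = -(std::ptrdiff_t)qword_count;         // negptr(qword_count)
  while (i != 0) {                                         // L_copy_qwords
    std::memcpy(end_to + i * 8, end_from + i * 8, 8);      // movq load/store pair
    i++;                                                   // increment sets ZF for jccb
  }
}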