src/cpu/aarch64/vm/stubGenerator_aarch64.cpp
*** 3668,3677 ****
--- 3668,3838 ----
__ eor(lo, __ T16B, lo, t1);
__ pmull(t0, __ T1Q, hi, p, __ T1D);
__ eor(result, __ T16B, lo, t0);
}
+ address generate_has_negatives(address &has_negatives_long) {
+ StubCodeMark mark(this, "StubRoutines", "has_negatives");
+ const int large_loop_size = 64;
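+ // Bit 7 of every byte; a byte counts as "negative" iff this bit is set.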
+ const uint64_t UPPER_BIT_MASK=0x8080808080808080;
+ int dcache_line = VM_Version::dcache_line_size();
+
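+ // Inputs: ary1 (r1) = byte array address, len (r2) = length in bytes.
+ // Output: result (r0) = non-zero if any byte has its sign bit set, else zero.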
+ Register ary1 = r1, len = r2, result = r0;
+
+ __ align(CodeEntryAlignment);
+ address entry = __ pc();
+
+ __ enter();
+
+ Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
+ LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
+
+ __ cmp(len, 15);
+ __ br(Assembler::GT, LEN_OVER_15);
+ // The only case when execution falls into this code is when the pointer is
+ // near the end of a memory page and we have to avoid reading the next page.
+ __ add(ary1, ary1, len);
+ __ subs(len, len, 8);
+ __ br(Assembler::GT, LEN_OVER_8);
+ __ ldr(rscratch2, Address(ary1, -8));
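+ // The 8 bytes just loaded end at the last array byte, so on little-endian
+ // their low-order lanes may hold bytes that precede the array; shift those
+ // out before testing the sign bits.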
+ __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes.
+ __ lsrv(rscratch2, rscratch2, rscratch1);
+ __ tst(rscratch2, UPPER_BIT_MASK);
+ __ cset(result, Assembler::NE);
+ __ leave();
+ __ ret(lr);
+ __ bind(LEN_OVER_8);
+ __ ldp(rscratch1, rscratch2, Address(ary1, -16));
+ __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
+ __ tst(rscratch2, UPPER_BIT_MASK);
+ __ br(Assembler::NE, RET_TRUE_NO_POP);
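+ // rscratch1 holds bytes [end-16, end-8); those that precede the array start
+ // sit in its low-order lanes (little-endian), so shift them out before testing.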
+ __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
+ __ lsrv(rscratch1, rscratch1, rscratch2);
+ __ tst(rscratch1, UPPER_BIT_MASK);
+ __ cset(result, Assembler::NE);
+ __ leave();
+ __ ret(lr);
+
+ Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
+ const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
+
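+ // Second entry point; it skips the short-array handling above and falls
+ // straight into LEN_OVER_15, so it is only valid when len > 15.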
+ has_negatives_long = __ pc(); // 2nd entry point
+
+ __ enter();
+
+ __ bind(LEN_OVER_15);
+ __ push(spilled_regs, sp);
+ __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
+ __ cbz(rscratch2, ALIGNED);
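+ // Unaligned start: check the first 16 bytes (they may overlap the aligned
+ // region scanned below) and advance ary1 to the next 16-byte boundary.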
+ __ ldp(tmp6, tmp1, Address(ary1));
+ __ mov(tmp5, 16);
+ __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
+ __ add(ary1, ary1, rscratch1);
+ __ sub(len, len, rscratch1);
+ __ orr(tmp6, tmp6, tmp1);
+ __ tst(tmp6, UPPER_BIT_MASK);
+ __ br(Assembler::NE, RET_TRUE);
+
+ __ bind(ALIGNED);
+ __ cmp(len, large_loop_size);
+ __ br(Assembler::LT, CHECK_16);
+ // Perform a 16-byte load as an early-return check in the pre-loop, to handle
+ // the situation where an initially aligned large array has negative values in
+ // its starting bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 in
+ // the worst case, which is slower. Cases with negative bytes further ahead
+ // are barely affected; in fact they get faster thanks to the early loads,
+ // fewer instructions and fewer branches in LARGE_LOOP.
+ __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
+ __ sub(len, len, 16);
+ __ orr(tmp6, tmp6, tmp1);
+ __ tst(tmp6, UPPER_BIT_MASK);
+ __ br(Assembler::NE, RET_TRUE);
+ __ cmp(len, large_loop_size);
+ __ br(Assembler::LT, CHECK_16);
+
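+ // Warm the cache one line short of the hint distance; the loop below then
+ // prefetches at the full hint distance on every iteration.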
+ if (SoftwarePrefetchHintDistance >= 0
+ && SoftwarePrefetchHintDistance >= dcache_line) {
+ // initial prefetch
+ __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
+ }
+ __ bind(LARGE_LOOP);
+ if (SoftwarePrefetchHintDistance >= 0) {
+ __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
+ }
+ // Issue the load instructions first, since that can save a few CPU/MEM cycles.
+ // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...);" (one per ldp),
+ // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which
+ // saves 3 instructions per iteration and has fewer branches. The trade-off is
+ // that early return is disabled, so all 64 bytes are loaded and checked every
+ // time.
+ __ ldp(tmp2, tmp3, Address(ary1));
+ __ ldp(tmp4, tmp5, Address(ary1, 16));
+ __ ldp(rscratch1, rscratch2, Address(ary1, 32));
+ __ ldp(tmp6, tmp1, Address(ary1, 48));
+ __ add(ary1, ary1, large_loop_size);
+ __ sub(len, len, large_loop_size);
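+ // OR-reduce the eight 64-bit words pairwise into tmp2 so a single tst checks
+ // the sign bit of every byte in the 64-byte block.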
+ __ orr(tmp2, tmp2, tmp3);
+ __ orr(tmp4, tmp4, tmp5);
+ __ orr(rscratch1, rscratch1, rscratch2);
+ __ orr(tmp6, tmp6, tmp1);
+ __ orr(tmp2, tmp2, tmp4);
+ __ orr(rscratch1, rscratch1, tmp6);
+ __ orr(tmp2, tmp2, rscratch1);
+ __ tst(tmp2, UPPER_BIT_MASK);
+ __ br(Assembler::NE, RET_TRUE);
+ __ cmp(len, large_loop_size);
+ __ br(Assembler::GE, LARGE_LOOP);
+
+ __ bind(CHECK_16); // small 16-byte load pre-loop
+ __ cmp(len, 16);
+ __ br(Assembler::LT, POST_LOOP16);
+
+ __ bind(LOOP16); // small 16-byte load loop
+ __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
+ __ sub(len, len, 16);
+ __ orr(tmp2, tmp2, tmp3);
+ __ tst(tmp2, UPPER_BIT_MASK);
+ __ br(Assembler::NE, RET_TRUE);
+ __ cmp(len, 16);
+ __ br(Assembler::GE, LOOP16); // 16-byte load loop end
+
+ __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
+ __ cmp(len, 8);
+ __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
+ __ ldr(tmp3, Address(__ post(ary1, 8)));
+ __ sub(len, len, 8);
+ __ tst(tmp3, UPPER_BIT_MASK);
+ __ br(Assembler::NE, RET_TRUE);
+
+ __ bind(POST_LOOP16_LOAD_TAIL);
+ __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
+ __ ldr(tmp1, Address(ary1));
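+ // Discard the bytes read past the logical end: on little-endian they occupy
+ // the high-order lanes, so shift them out before testing the sign bits.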
+ __ mov(tmp2, 64);
+ __ sub(tmp4, tmp2, len, __ LSL, 3);
+ __ lslv(tmp1, tmp1, tmp4);
+ __ tst(tmp1, UPPER_BIT_MASK);
+ __ br(Assembler::NE, RET_TRUE);
+ // Fallthrough
+
+ __ bind(RET_FALSE);
+ __ pop(spilled_regs, sp);
+ __ leave();
+ __ mov(result, zr);
+ __ ret(lr);
+
+ __ bind(RET_TRUE);
+ __ pop(spilled_regs, sp);
+ __ bind(RET_TRUE_NO_POP);
+ __ leave();
+ __ mov(result, 1);
+ __ ret(lr);
+
+ __ bind(DONE);
+ __ pop(spilled_regs, sp);
+ __ leave();
+ __ ret(lr);
+ return entry;
+ }
/**
* Arguments:
*
* Input:
* c_rarg0 - current state address
*** 4684,4693 ****
--- 4845,4855 ----
// while (t0)
// t0 = sub(Pm_base, Pn_base, t0, len);
// }
};
+
// Initialization
void generate_initial() {
// Generate initial stubs and initializes the entry points
// entry points that exist in all platforms Note: This is code
*** 4742,4751 ****
--- 4904,4916 ----
throw_NullPointerException_at_call));
// arraycopy stubs used by compilers
generate_arraycopy_stubs();
+ // has negatives stub for large arrays.
+ StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
+
if (UseMultiplyToLenIntrinsic) {
StubRoutines::_multiplyToLen = generate_multiplyToLen();
}
if (UseMontgomeryMultiplyIntrinsic) {