src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp


*** 492,502 ****
    if (!swap_reg_contains_mark) {
      null_check_offset = offset();
      ldr(swap_reg, mark_addr);
    }
    andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
!   cmp(tmp_reg, markOopDesc::biased_lock_pattern);
    br(Assembler::NE, cas_label);
    // The bias pattern is present in the object's header. Need to check
    // whether the bias owner and the epoch are both still current.
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, tmp_reg, rthread);
--- 492,502 ----
    if (!swap_reg_contains_mark) {
      null_check_offset = offset();
      ldr(swap_reg, mark_addr);
    }
    andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
!   cmp(tmp_reg, (u1)markOopDesc::biased_lock_pattern);
    br(Assembler::NE, cas_label);
    // The bias pattern is present in the object's header. Need to check
    // whether the bias owner and the epoch are both still current.
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, tmp_reg, rthread);

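The hunks in this file follow two recurring patterns. The first, as above, casts the immediate operand of cmp to u1. A plausible reading (an assumption on my part, not stated in the webrev itself): the cmp(Register, u1) overload deliberately accepts only immediates known to fit in eight bits, so oversized constants are rejected at the call site instead of being silently mis-encoded. A minimal C++ sketch of that idea, with hypothetical names:

    // A minimal sketch, assuming (not verified against HotSpot) that u1 is
    // an 8-bit unsigned typedef and that cmp(Register, u1) exists precisely
    // so out-of-range immediates are caught at the call site.
    #include <cstdint>
    #include <cstdio>

    typedef uint8_t u1;            // assumed 8-bit unsigned, as in the new signatures

    struct Register { int enc; };

    struct AssemblerSketch {
      // Narrow entry point: the (u1) casts in the hunks select this overload,
      // asserting the constant is known to fit in eight bits.
      void cmp(Register r, u1 imm) {
        std::printf("cmp  x%d, #%u\n", r.enc, unsigned(imm));
      }
      // Wider constants are compared with a flag-setting subtract against the
      // zero register instead; see the cmp -> subs hunks below.
      void subs_zr(Register n, uint64_t imm) {
        std::printf("subs xzr, x%d, #%llu\n", n.enc, (unsigned long long)imm);
      }
    };

    int main() {
      AssemblerSketch as;
      Register len{3};
      as.cmp(len, (u1)64);    // 64 fits in u1: narrow overload is fine
      as.subs_zr(len, 256);   // 256 does not fit in u1: use the subs form
    }
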
*** 631,641 ****
    // a higher level. Second, if the bias was revoked while we held the
    // lock, the object could not be rebiased toward another thread, so
    // the bias bit would be clear.
    ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
    andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
!   cmp(temp_reg, markOopDesc::biased_lock_pattern);
    br(Assembler::EQ, done);
  }
  
  static void pass_arg0(MacroAssembler* masm, Register arg) {
    if (c_rarg0 != arg ) {
--- 631,641 ----
    // a higher level. Second, if the bias was revoked while we held the
    // lock, the object could not be rebiased toward another thread, so
    // the bias bit would be clear.
    ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
    andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
!   cmp(temp_reg, (u1)markOopDesc::biased_lock_pattern);
    br(Assembler::EQ, done);
  }
  
  static void pass_arg0(MacroAssembler* masm, Register arg) {
    if (c_rarg0 != arg ) {

*** 1127,1137 ****
    // So if it was a primary super, we can just fail immediately.
    // Otherwise, it's the slow path for us (no success at this point).
  
    if (super_check_offset.is_register()) {
      br(Assembler::EQ, *L_success);
!     cmp(super_check_offset.as_register(), sc_offset);
      if (L_failure == &L_fallthrough) {
        br(Assembler::EQ, *L_slow_path);
      } else {
        br(Assembler::NE, *L_failure);
        final_jmp(*L_slow_path);
--- 1127,1137 ----
    // So if it was a primary super, we can just fail immediately.
    // Otherwise, it's the slow path for us (no success at this point).
  
    if (super_check_offset.is_register()) {
      br(Assembler::EQ, *L_success);
!     subs(zr, super_check_offset.as_register(), sc_offset);
      if (L_failure == &L_fallthrough) {
        br(Assembler::EQ, *L_slow_path);
      } else {
        br(Assembler::NE, *L_failure);
        final_jmp(*L_slow_path);

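The second pattern, as in the hunk above, rewrites cmp(reg, imm) as subs(zr, reg, imm). On AArch64, CMP with an immediate is an alias of SUBS with the zero register as destination, so the rewrite leaves the NZCV flags, and therefore every following br(cond, ...), unchanged, while going through an entry point that accepts immediates too wide for the u1 overload. A small sketch (hypothetical helper, not HotSpot code) of the flag equivalence:

    // CMP is an alias of SUBS with the zero register as the destination, so
    // both forms compute identical NZCV flags; cmp merely discards the
    // arithmetic result.
    #include <cstdint>
    #include <cstdio>
    #include <initializer_list>

    // NZCV flags of "subs <dst>, a, imm", computed the way the hardware does.
    static unsigned nzcv_of_subs(uint64_t a, uint64_t imm) {
      uint64_t r = a - imm;
      unsigned n = (unsigned)((r >> 63) & 1);                     // negative
      unsigned z = (r == 0);                                      // zero
      unsigned c = (a >= imm);                                    // carry: no borrow
      unsigned v = (unsigned)((((a ^ imm) & (a ^ r)) >> 63) & 1); // signed overflow
      return (n << 3) | (z << 2) | (c << 1) | v;
    }

    int main() {
      // cmp(reg, 256) and subs(zr, reg, 256) drive branches identically.
      for (uint64_t a : {100ull, 256ull, 1000ull}) {
        std::printf("a=%4llu  NZCV=0x%x\n", (unsigned long long)a, nzcv_of_subs(a, 256));
      }
    }
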
*** 3302,3312 ****
    add(table1, table0, 1*256*sizeof(juint));
    add(table2, table0, 2*256*sizeof(juint));
    add(table3, table0, 3*256*sizeof(juint));
  
    if (UseNeon) {
!     cmp(len, 64);
      br(Assembler::LT, L_by16);
      eor(v16, T16B, v16, v16);
  
      Label L_fold;
  
--- 3302,3312 ----
    add(table1, table0, 1*256*sizeof(juint));
    add(table2, table0, 2*256*sizeof(juint));
    add(table3, table0, 3*256*sizeof(juint));
  
    if (UseNeon) {
!     cmp(len, (u1)64);
      br(Assembler::LT, L_by16);
      eor(v16, T16B, v16, v16);
  
      Label L_fold;
  
*** 4352,4365 ****
    // For larger pattern and source we use a simplified Boyer Moore algorithm.
    // With a small pattern and source we use linear scan.
  
    if (icnt1 == -1) {
      sub(result_tmp, cnt2, cnt1);
!     cmp(cnt1, 8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
      br(LT, LINEARSEARCH);
      dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
!     cmp(cnt1, 256);
      lsr(tmp1, cnt2, 2);
      ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
      br(GE, LINEARSTUB);
    }
  
--- 4352,4365 ----
    // For larger pattern and source we use a simplified Boyer Moore algorithm.
    // With a small pattern and source we use linear scan.
  
    if (icnt1 == -1) {
      sub(result_tmp, cnt2, cnt1);
!     cmp(cnt1, (u1)8);         // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
      br(LT, LINEARSEARCH);
      dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
!     subs(zr, cnt1, 256);
      lsr(tmp1, cnt2, 2);
      ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
      br(GE, LINEARSTUB);
    }
  
*** 4461,4471 ****
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
!       cmp(ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
--- 4461,4471 ----
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
!       subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);

*** 4525,4535 ****
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
!       cmp(skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
--- 4525,4535 ----
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
!       subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);

*** 4546,4556 ****
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);
      b(DONE);
  
    BIND(LINEARSTUB);
!     cmp(cnt1, 16); // small patterns still should be handled by simple algorithm
      br(LT, LINEAR_MEDIUM);
      mov(result, zr);
      RuntimeAddress stub = NULL;
      if (isL) {
        stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
--- 4546,4556 ----
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);
      b(DONE);
  
    BIND(LINEARSTUB);
!     cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
      br(LT, LINEAR_MEDIUM);
      mov(result, zr);
      RuntimeAddress stub = NULL;
      if (isL) {
        stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());

*** 4575,4585 ****
  
    if (icnt1 == -1)
    {
      Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
  
!     cmp(cnt1, str1_isL == str2_isL ? 4 : 2);
      br(LT, DOSHORT);
    BIND(LINEAR_MEDIUM);
      (this->*str1_load_1chr)(first, Address(str1));
      lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
      sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
--- 4575,4585 ----
  
    if (icnt1 == -1)
    {
      Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
  
!     cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
      br(LT, DOSHORT);
    BIND(LINEAR_MEDIUM);
      (this->*str1_load_1chr)(first, Address(str1));
      lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
      sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);

*** 4610,4620 ****
        br(LT, STR1_NEXT);
        b(MATCH);
  
      BIND(DOSHORT);
        if (str1_isL == str2_isL) {
!         cmp(cnt1, 2);
          br(LT, DO1);
          br(GT, DO3);
        }
    }
  
--- 4610,4620 ----
        br(LT, STR1_NEXT);
        b(MATCH);
  
      BIND(DOSHORT);
        if (str1_isL == str2_isL) {
!         cmp(cnt1, (u1)2);
          br(LT, DO1);
          br(GT, DO3);
        }
    }
  
*** 4685,4695 ****
    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
  
    BIND(DO1);
      (this->*str1_load_1chr)(ch1, str1);
!     cmp(cnt2, 8);
      br(LT, DO1_SHORT);
  
      sub(result_tmp, cnt2, 8/str2_chr_size);
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
--- 4685,4695 ----
    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
  
    BIND(DO1);
      (this->*str1_load_1chr)(ch1, str1);
!     cmp(cnt2, (u1)8);
      br(LT, DO1_SHORT);
  
      sub(result_tmp, cnt2, 8/str2_chr_size);
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);

*** 4708,4718 ****
      bics(tmp1, tmp1, tmp2);
      br(NE, HAS_ZERO);
      adds(cnt2_neg, cnt2_neg, 8);
      br(LT, CH1_LOOP);
  
!     cmp(cnt2_neg, 8);
      mov(cnt2_neg, 0);
      br(LT, CH1_LOOP);
      b(NOMATCH);
  
    BIND(HAS_ZERO);
--- 4708,4718 ----
      bics(tmp1, tmp1, tmp2);
      br(NE, HAS_ZERO);
      adds(cnt2_neg, cnt2_neg, 8);
      br(LT, CH1_LOOP);
  
!     cmp(cnt2_neg, (u1)8);
      mov(cnt2_neg, 0);
      br(LT, CH1_LOOP);
      b(NOMATCH);
  
    BIND(HAS_ZERO);

*** 4751,4761 ****
    Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
    Register cnt1_neg = cnt1;
    Register ch1 = rscratch1;
    Register result_tmp = rscratch2;
  
!   cmp(cnt1, 4);
    br(LT, DO1_SHORT);
  
    orr(ch, ch, ch, LSL, 16);
    orr(ch, ch, ch, LSL, 32);
  
--- 4751,4761 ----
    Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
    Register cnt1_neg = cnt1;
    Register ch1 = rscratch1;
    Register result_tmp = rscratch2;
  
!   cmp(cnt1, (u1)4);
    br(LT, DO1_SHORT);
  
    orr(ch, ch, ch, LSL, 16);
    orr(ch, ch, ch, LSL, 32);
  
*** 4774,4784 ****
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);
  
!   cmp(cnt1_neg, 8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);
  
    BIND(HAS_ZERO);
--- 4774,4784 ----
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);
  
!   cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);
  
    BIND(HAS_ZERO);

*** 4811,4821 ****
                                      FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
    Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
        DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
        SHORT_LOOP_START, TAIL_CHECK;
  
!   const int STUB_THRESHOLD = 64 + 8;
    bool isLL = ae == StrIntrinsicNode::LL;
    bool isLU = ae == StrIntrinsicNode::LU;
    bool isUL = ae == StrIntrinsicNode::UL;
  
    bool str1_isL = isLL || isLU;
--- 4811,4821 ----
                                      FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
    Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
        DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
        SHORT_LOOP_START, TAIL_CHECK;
  
!   const u1 STUB_THRESHOLD = 64 + 8;
    bool isLL = ae == StrIntrinsicNode::LL;
    bool isLU = ae == StrIntrinsicNode::LU;
    bool isUL = ae == StrIntrinsicNode::UL;
  
    bool str1_isL = isLL || isLU;

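Presumably the point of this hunk: declaring STUB_THRESHOLD as u1 at its definition does once what the scattered (u1) casts do per call site, so later cmp(..., STUB_THRESHOLD) uses in string_compare resolve to the narrow immediate overload without further changes.
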
*** 5206,5219 ****
    ldrw(cnt1, Address(a1, length_offset));
    cbz(a2, DONE);
    ldrw(cnt2, Address(a2, length_offset));
    // on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's
    // faster to perform another branch before comparing a1 and a2
!   cmp(cnt1, elem_per_word);
    br(LE, SHORT); // short or same
    ldr(tmp3, Address(pre(a1, base_offset)));
!   cmp(cnt1, stubBytesThreshold);
    br(GE, STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    cmp(cnt2, cnt1);
    br(NE, DONE);
--- 5206,5219 ----
    ldrw(cnt1, Address(a1, length_offset));
    cbz(a2, DONE);
    ldrw(cnt2, Address(a2, length_offset));
    // on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's
    // faster to perform another branch before comparing a1 and a2
!   cmp(cnt1, (u1)elem_per_word);
    br(LE, SHORT); // short or same
    ldr(tmp3, Address(pre(a1, base_offset)));
!   subs(zr, cnt1, stubBytesThreshold);
    br(GE, STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    cmp(cnt2, cnt1);
    br(NE, DONE);

*** 5226,5236 ****
        br(LE, TAIL);
        eor(tmp4, tmp3, tmp4);
        cbnz(tmp4, DONE);
        ldr(tmp3, Address(pre(a1, wordSize)));
        ldr(tmp4, Address(pre(a2, wordSize)));
!       cmp(cnt1, elem_per_word);
        br(LE, TAIL2);
        cmp(tmp1, tmp2);
      }
      br(EQ, NEXT_DWORD);
      b(DONE);
--- 5226,5236 ----
        br(LE, TAIL);
        eor(tmp4, tmp3, tmp4);
        cbnz(tmp4, DONE);
        ldr(tmp3, Address(pre(a1, wordSize)));
        ldr(tmp4, Address(pre(a2, wordSize)));
!       cmp(cnt1, (u1)elem_per_word);
        br(LE, TAIL2);
        cmp(tmp1, tmp2);
      }
      br(EQ, NEXT_DWORD);
      b(DONE);

*** 5399,5409 ****
  {
    assert(is_power_of_2(zero_words_block_size), "adjust this");
    assert(ptr == r10 && cnt == r11, "mismatch in register usage");
  
    BLOCK_COMMENT("zero_words {");
!   cmp(cnt, zero_words_block_size);
    Label around, done, done16;
    br(LO, around);
    {
      RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
      assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
--- 5399,5409 ----
  {
    assert(is_power_of_2(zero_words_block_size), "adjust this");
    assert(ptr == r10 && cnt == r11, "mismatch in register usage");
  
    BLOCK_COMMENT("zero_words {");
!   cmp(cnt, (u1)zero_words_block_size);
    Label around, done, done16;
    br(LO, around);
    {
      RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
      assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");

*** 5580,5598 ****
    Register tmp1 = rscratch1, tmp2 = rscratch2;
  
    mov(result, len); // Save initial len
  
  #ifndef BUILTIN_SIM
!   cmp(len, 8); // handle shortest strings first
    br(LT, LOOP_1);
!   cmp(len, 32);
    br(LT, NEXT_8);
    // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
    // to convert chars to bytes
    if (SoftwarePrefetchHintDistance >= 0) {
      ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
!     cmp(len, SoftwarePrefetchHintDistance/2 + 16);
      br(LE, NEXT_32_START);
      b(NEXT_32_PRFM_START);
      BIND(NEXT_32_PRFM);
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      BIND(NEXT_32_PRFM_START);
--- 5580,5598 ----
    Register tmp1 = rscratch1, tmp2 = rscratch2;
  
    mov(result, len); // Save initial len
  
  #ifndef BUILTIN_SIM
!   cmp(len, (u1)8); // handle shortest strings first
    br(LT, LOOP_1);
!   cmp(len, (u1)32);
    br(LT, NEXT_8);
    // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
    // to convert chars to bytes
    if (SoftwarePrefetchHintDistance >= 0) {
      ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
!     subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
      br(LE, NEXT_32_START);
      b(NEXT_32_PRFM_START);
      BIND(NEXT_32_PRFM);
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      BIND(NEXT_32_PRFM_START);

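This hunk also shows a third variant: the compared value is derived from the runtime flag SoftwarePrefetchHintDistance, so it cannot be proven at compile time to fit the u1 overload, and the diff compares with a flag-setting subtract whose result lands in a temporary. A sketch with hypothetical names, not HotSpot's real declarations:

    // When the threshold is only known at run time, the narrow cmp overload
    // is unusable; a subs into a temporary (or the zero register) sets the
    // same flags and accepts any magnitude, possibly via a scratch register.
    #include <cstdint>
    #include <cstdio>

    typedef uint8_t u1;

    struct Register { int enc; };

    struct AssemblerSketch {
      void cmp(Register r, u1 imm) {                   // compile-time-small constants only
        std::printf("cmp  x%d, #%u\n", r.enc, unsigned(imm));
      }
      void subs(Register d, Register n, int64_t imm) { // any immediate
        std::printf("subs x%d, x%d, #%lld\n", d.enc, n.enc, (long long)imm);
      }
    };

    int main() {
      AssemblerSketch as;
      Register len{2}, tmp2{16};
      int64_t SoftwarePrefetchHintDistance = 256;      // runtime-configurable in HotSpot
      // Same flags as a cmp against the threshold, but legal for any magnitude:
      as.subs(tmp2, len, SoftwarePrefetchHintDistance / 2 + 16);
    }
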
*** 5608,5620 ****
        orr(tmp1, tmp1, tmp2);
        cbnz(tmp1, LOOP_8);
        sub(len, len, 32);
        add(dst, dst, 32);
        add(src, src, 64);
!       cmp(len, SoftwarePrefetchHintDistance/2 + 16);
        br(GE, NEXT_32_PRFM);
!       cmp(len, 32);
        br(LT, LOOP_8);
      BIND(NEXT_32);
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      BIND(NEXT_32_START);
    } else {
--- 5608,5620 ----
        orr(tmp1, tmp1, tmp2);
        cbnz(tmp1, LOOP_8);
        sub(len, len, 32);
        add(dst, dst, 32);
        add(src, src, 64);
!       subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
        br(GE, NEXT_32_PRFM);
!       cmp(len, (u1)32);
        br(LT, LOOP_8);
      BIND(NEXT_32);
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      BIND(NEXT_32_START);
    } else {

*** 5633,5648 ****
      orr(tmp1, tmp1, tmp2);
      cbnz(tmp1, LOOP_8);
      sub(len, len, 32);
      add(dst, dst, 32);
      add(src, src, 64);
!     cmp(len, 32);
      br(GE, NEXT_32);
      cbz(len, DONE);
  
    BIND(LOOP_8);
!     cmp(len, 8);
      br(LT, LOOP_1);
    BIND(NEXT_8);
      ld1(Vtmp1, T8H, src);
      uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
      uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
--- 5633,5648 ----
      orr(tmp1, tmp1, tmp2);
      cbnz(tmp1, LOOP_8);
      sub(len, len, 32);
      add(dst, dst, 32);
      add(src, src, 64);
!     cmp(len, (u1)32);
      br(GE, NEXT_32);
      cbz(len, DONE);
  
    BIND(LOOP_8);
!     cmp(len, (u1)8);
      br(LT, LOOP_1);
    BIND(NEXT_8);
      ld1(Vtmp1, T8H, src);
      uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
      uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes

*** 5651,5661 ****
      cbnz(tmp1, NEXT_1);
  
      sub(len, len, 8);
      add(dst, dst, 8);
      add(src, src, 16);
!     cmp(len, 8);
      br(GE, NEXT_8);
  
    BIND(LOOP_1);
  #endif
    cbz(len, DONE);
--- 5651,5661 ----
      cbnz(tmp1, NEXT_1);
  
      sub(len, len, 8);
      add(dst, dst, 8);
      add(src, src, 16);
!     cmp(len, (u1)8);
      br(GE, NEXT_8);
  
    BIND(LOOP_1);
  #endif
    cbz(len, DONE);

*** 5728,5738 ****
  
    if (SoftwarePrefetchHintDistance >= 0) {
      const int large_loop_threshold = (64 + 16)/8;
      ldrd(vtmp2, post(src, 8));
      andw(len, len, 7);
!     cmp(tmp4, large_loop_threshold);
      br(GE, to_stub);
      b(loop_start);
  
      bind(loop);
      ldrd(vtmp2, post(src, 8));
--- 5728,5738 ----
  
    if (SoftwarePrefetchHintDistance >= 0) {
      const int large_loop_threshold = (64 + 16)/8;
      ldrd(vtmp2, post(src, 8));
      andw(len, len, 7);
!     cmp(tmp4, (u1)large_loop_threshold);
      br(GE, to_stub);
      b(loop_start);
  
      bind(loop);
      ldrd(vtmp2, post(src, 8));
