src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp


*** 492,502 ****
    if (!swap_reg_contains_mark) {
      null_check_offset = offset();
      ldr(swap_reg, mark_addr);
    }
    andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
!   cmp(tmp_reg, markOopDesc::biased_lock_pattern);
    br(Assembler::NE, cas_label);
    // The bias pattern is present in the object's header. Need to check
    // whether the bias owner and the epoch are both still current.
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, tmp_reg, rthread);
--- 492,502 ----
    if (!swap_reg_contains_mark) {
      null_check_offset = offset();
      ldr(swap_reg, mark_addr);
    }
    andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
!   cmp(tmp_reg, (u1)markOopDesc::biased_lock_pattern);
    br(Assembler::NE, cas_label);
    // The bias pattern is present in the object's header. Need to check
    // whether the bias owner and the epoch are both still current.
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, tmp_reg, rthread);

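The hunks in this file follow two recurring patterns. The first, as above, casts the immediate operand of cmp to u1. A plausible reading (an assumption on my part, not stated in the webrev itself): the cmp(Register, u1) overload deliberately accepts only immediates known to fit in eight bits, so oversized constants are rejected at the call site instead of being silently mis-encoded. A minimal C++ sketch of that idea, with hypothetical names:

    // A minimal sketch, assuming (not verified against HotSpot) that u1 is
    // an 8-bit unsigned typedef and that cmp(Register, u1) exists precisely
    // so out-of-range immediates are caught at the call site.
    #include <cstdint>
    #include <cstdio>

    typedef uint8_t u1;            // assumed 8-bit unsigned, as in the new signatures

    struct Register { int enc; };

    struct AssemblerSketch {
      // Narrow entry point: the (u1) casts in the hunks select this overload,
      // asserting the constant is known to fit in eight bits.
      void cmp(Register r, u1 imm) {
        std::printf("cmp  x%d, #%u\n", r.enc, unsigned(imm));
      }
      // Wider constants are compared with a flag-setting subtract against the
      // zero register instead; see the cmp -> subs hunks below.
      void subs_zr(Register n, uint64_t imm) {
        std::printf("subs xzr, x%d, #%llu\n", n.enc, (unsigned long long)imm);
      }
    };

    int main() {
      AssemblerSketch as;
      Register len{3};
      as.cmp(len, (u1)64);    // 64 fits in u1: narrow overload is fine
      as.subs_zr(len, 256);   // 256 does not fit in u1: use the subs form
    }
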
*** 631,641 ****
    // a higher level. Second, if the bias was revoked while we held the
    // lock, the object could not be rebiased toward another thread, so
    // the bias bit would be clear.
    ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
    andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
!   cmp(temp_reg, markOopDesc::biased_lock_pattern);
    br(Assembler::EQ, done);
  }
  
  static void pass_arg0(MacroAssembler* masm, Register arg) {
    if (c_rarg0 != arg ) {
--- 631,641 ----
    // a higher level. Second, if the bias was revoked while we held the
    // lock, the object could not be rebiased toward another thread, so
    // the bias bit would be clear.
    ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
    andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
!   cmp(temp_reg, (u1)markOopDesc::biased_lock_pattern);
    br(Assembler::EQ, done);
  }
  
  static void pass_arg0(MacroAssembler* masm, Register arg) {
    if (c_rarg0 != arg ) {

*** 1127,1137 ****
    // So if it was a primary super, we can just fail immediately.
    // Otherwise, it's the slow path for us (no success at this point).
  
    if (super_check_offset.is_register()) {
      br(Assembler::EQ, *L_success);
!     cmp(super_check_offset.as_register(), sc_offset);
      if (L_failure == &L_fallthrough) {
        br(Assembler::EQ, *L_slow_path);
      } else {
        br(Assembler::NE, *L_failure);
        final_jmp(*L_slow_path);
--- 1127,1137 ----
    // So if it was a primary super, we can just fail immediately.
    // Otherwise, it's the slow path for us (no success at this point).
  
    if (super_check_offset.is_register()) {
      br(Assembler::EQ, *L_success);
!     subs(zr, super_check_offset.as_register(), sc_offset);
      if (L_failure == &L_fallthrough) {
        br(Assembler::EQ, *L_slow_path);
      } else {
        br(Assembler::NE, *L_failure);
        final_jmp(*L_slow_path);

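The second pattern, as in the hunk above, rewrites cmp(reg, imm) as subs(zr, reg, imm). On AArch64, CMP with an immediate is an alias of SUBS with the zero register as destination, so the rewrite leaves the NZCV flags, and therefore every following br(cond, ...), unchanged, while going through an entry point that accepts immediates too wide for the u1 overload. A small sketch (hypothetical helper, not HotSpot code) of the flag equivalence:

    // CMP is an alias of SUBS with the zero register as the destination, so
    // both forms compute identical NZCV flags; cmp merely discards the
    // arithmetic result.
    #include <cstdint>
    #include <cstdio>
    #include <initializer_list>

    // NZCV flags of "subs <dst>, a, imm", computed the way the hardware does.
    static unsigned nzcv_of_subs(uint64_t a, uint64_t imm) {
      uint64_t r = a - imm;
      unsigned n = (unsigned)((r >> 63) & 1);                     // negative
      unsigned z = (r == 0);                                      // zero
      unsigned c = (a >= imm);                                    // carry: no borrow
      unsigned v = (unsigned)((((a ^ imm) & (a ^ r)) >> 63) & 1); // signed overflow
      return (n << 3) | (z << 2) | (c << 1) | v;
    }

    int main() {
      // cmp(reg, 256) and subs(zr, reg, 256) drive branches identically.
      for (uint64_t a : {100ull, 256ull, 1000ull}) {
        std::printf("a=%4llu  NZCV=0x%x\n", (unsigned long long)a, nzcv_of_subs(a, 256));
      }
    }
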
*** 3302,3312 ****
    add(table1, table0, 1*256*sizeof(juint));
    add(table2, table0, 2*256*sizeof(juint));
    add(table3, table0, 3*256*sizeof(juint));
  
    if (UseNeon) {
!     cmp(len, 64);
      br(Assembler::LT, L_by16);
      eor(v16, T16B, v16, v16);
  
      Label L_fold;
  
--- 3302,3312 ----
    add(table1, table0, 1*256*sizeof(juint));
    add(table2, table0, 2*256*sizeof(juint));
    add(table3, table0, 3*256*sizeof(juint));
  
    if (UseNeon) {
!     cmp(len, (u1)64);
      br(Assembler::LT, L_by16);
      eor(v16, T16B, v16, v16);
  
      Label L_fold;
  
*** 4352,4365 ****
    // For larger pattern and source we use a simplified Boyer Moore algorithm.
    // With a small pattern and source we use linear scan.
  
    if (icnt1 == -1) {
      sub(result_tmp, cnt2, cnt1);
!     cmp(cnt1, 8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
      br(LT, LINEARSEARCH);
      dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
!     cmp(cnt1, 256);
      lsr(tmp1, cnt2, 2);
      ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
      br(GE, LINEARSTUB);
    }
  
--- 4352,4365 ----
    // For larger pattern and source we use a simplified Boyer Moore algorithm.
    // With a small pattern and source we use linear scan.
  
    if (icnt1 == -1) {
      sub(result_tmp, cnt2, cnt1);
!     cmp(cnt1, (u1)8);         // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
      br(LT, LINEARSEARCH);
      dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
!     subs(zr, cnt1, 256);
      lsr(tmp1, cnt2, 2);
      ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
      br(GE, LINEARSTUB);
    }
  
*** 4461,4471 ****
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
!       cmp(ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
--- 4461,4471 ----
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
!       subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);

*** 4525,4535 ****
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
!       cmp(skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
--- 4525,4535 ----
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
!       subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);

*** 4546,4556 ****
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);
      b(DONE);
  
    BIND(LINEARSTUB);
!     cmp(cnt1, 16); // small patterns still should be handled by simple algorithm
      br(LT, LINEAR_MEDIUM);
      mov(result, zr);
      RuntimeAddress stub = NULL;
      if (isL) {
        stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
--- 4546,4556 ----
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);
      b(DONE);
  
    BIND(LINEARSTUB);
!     cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
      br(LT, LINEAR_MEDIUM);
      mov(result, zr);
      RuntimeAddress stub = NULL;
      if (isL) {
        stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());

*** 4575,4585 ****
  
    if (icnt1 == -1)
    {
      Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
  
!     cmp(cnt1, str1_isL == str2_isL ? 4 : 2);
      br(LT, DOSHORT);
    BIND(LINEAR_MEDIUM);
      (this->*str1_load_1chr)(first, Address(str1));
      lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
      sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
--- 4575,4585 ----
  
    if (icnt1 == -1)
    {
      Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
  
!     cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
      br(LT, DOSHORT);
    BIND(LINEAR_MEDIUM);
      (this->*str1_load_1chr)(first, Address(str1));
      lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
      sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);

*** 4610,4620 ****
        br(LT, STR1_NEXT);
        b(MATCH);
  
      BIND(DOSHORT);
        if (str1_isL == str2_isL) {
!         cmp(cnt1, 2);
          br(LT, DO1);
          br(GT, DO3);
        }
    }
  
--- 4610,4620 ----
        br(LT, STR1_NEXT);
        b(MATCH);
  
      BIND(DOSHORT);
        if (str1_isL == str2_isL) {
!         cmp(cnt1, (u1)2);
          br(LT, DO1);
          br(GT, DO3);
        }
    }
  
*** 4685,4695 ****
    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
  
    BIND(DO1);
      (this->*str1_load_1chr)(ch1, str1);
!     cmp(cnt2, 8);
      br(LT, DO1_SHORT);
  
      sub(result_tmp, cnt2, 8/str2_chr_size);
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
--- 4685,4695 ----
    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
  
    BIND(DO1);
      (this->*str1_load_1chr)(ch1, str1);
!     cmp(cnt2, (u1)8);
      br(LT, DO1_SHORT);
  
      sub(result_tmp, cnt2, 8/str2_chr_size);
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);

*** 4708,4718 ****
      bics(tmp1, tmp1, tmp2);
      br(NE, HAS_ZERO);
      adds(cnt2_neg, cnt2_neg, 8);
      br(LT, CH1_LOOP);
  
!     cmp(cnt2_neg, 8);
      mov(cnt2_neg, 0);
      br(LT, CH1_LOOP);
      b(NOMATCH);
  
    BIND(HAS_ZERO);
--- 4708,4718 ----
      bics(tmp1, tmp1, tmp2);
      br(NE, HAS_ZERO);
      adds(cnt2_neg, cnt2_neg, 8);
      br(LT, CH1_LOOP);
  
!     cmp(cnt2_neg, (u1)8);
      mov(cnt2_neg, 0);
      br(LT, CH1_LOOP);
      b(NOMATCH);
  
    BIND(HAS_ZERO);

*** 4751,4761 ****
    Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
    Register cnt1_neg = cnt1;
    Register ch1 = rscratch1;
    Register result_tmp = rscratch2;
  
!   cmp(cnt1, 4);
    br(LT, DO1_SHORT);
  
    orr(ch, ch, ch, LSL, 16);
    orr(ch, ch, ch, LSL, 32);
  
--- 4751,4761 ----
    Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
    Register cnt1_neg = cnt1;
    Register ch1 = rscratch1;
    Register result_tmp = rscratch2;
  
!   cmp(cnt1, (u1)4);
    br(LT, DO1_SHORT);
  
    orr(ch, ch, ch, LSL, 16);
    orr(ch, ch, ch, LSL, 32);
  
*** 4774,4784 ****
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);
  
!   cmp(cnt1_neg, 8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);
  
    BIND(HAS_ZERO);
--- 4774,4784 ----
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);
  
!   cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);
  
    BIND(HAS_ZERO);

*** 4811,4821 ****
                                      FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
    Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
        DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
        SHORT_LOOP_START, TAIL_CHECK;
  
!   const int STUB_THRESHOLD = 64 + 8;
    bool isLL = ae == StrIntrinsicNode::LL;
    bool isLU = ae == StrIntrinsicNode::LU;
    bool isUL = ae == StrIntrinsicNode::UL;
  
    bool str1_isL = isLL || isLU;
--- 4811,4821 ----
                                      FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
    Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
        DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
        SHORT_LOOP_START, TAIL_CHECK;
  
!   const u1 STUB_THRESHOLD = 64 + 8;
    bool isLL = ae == StrIntrinsicNode::LL;
    bool isLU = ae == StrIntrinsicNode::LU;
    bool isUL = ae == StrIntrinsicNode::UL;
  
    bool str1_isL = isLL || isLU;

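Presumably the point of this hunk: declaring STUB_THRESHOLD as u1 at its definition does once what the scattered (u1) casts do per call site, so later cmp(..., STUB_THRESHOLD) uses in string_compare resolve to the narrow immediate overload without further changes.
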
*** 5206,5219 ****
    ldrw(cnt1, Address(a1, length_offset));
    cbz(a2, DONE);
    ldrw(cnt2, Address(a2, length_offset));
    // on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's
    // faster to perform another branch before comparing a1 and a2
!   cmp(cnt1, elem_per_word);
    br(LE, SHORT); // short or same
    ldr(tmp3, Address(pre(a1, base_offset)));
!   cmp(cnt1, stubBytesThreshold);
    br(GE, STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    cmp(cnt2, cnt1);
    br(NE, DONE);
--- 5206,5219 ----
    ldrw(cnt1, Address(a1, length_offset));
    cbz(a2, DONE);
    ldrw(cnt2, Address(a2, length_offset));
    // on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's
    // faster to perform another branch before comparing a1 and a2
!   cmp(cnt1, (u1)elem_per_word);
    br(LE, SHORT); // short or same
    ldr(tmp3, Address(pre(a1, base_offset)));
!   subs(zr, cnt1, stubBytesThreshold);
    br(GE, STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    cmp(cnt2, cnt1);
    br(NE, DONE);

*** 5226,5236 ****
        br(LE, TAIL);
        eor(tmp4, tmp3, tmp4);
        cbnz(tmp4, DONE);
        ldr(tmp3, Address(pre(a1, wordSize)));
        ldr(tmp4, Address(pre(a2, wordSize)));
!       cmp(cnt1, elem_per_word);
        br(LE, TAIL2);
        cmp(tmp1, tmp2);
      }
      br(EQ, NEXT_DWORD);
      b(DONE);
--- 5226,5236 ----
        br(LE, TAIL);
        eor(tmp4, tmp3, tmp4);
        cbnz(tmp4, DONE);
        ldr(tmp3, Address(pre(a1, wordSize)));
        ldr(tmp4, Address(pre(a2, wordSize)));
!       cmp(cnt1, (u1)elem_per_word);
        br(LE, TAIL2);
        cmp(tmp1, tmp2);
      }
      br(EQ, NEXT_DWORD);
      b(DONE);

*** 5399,5409 ****
  {
    assert(is_power_of_2(zero_words_block_size), "adjust this");
    assert(ptr == r10 && cnt == r11, "mismatch in register usage");
  
    BLOCK_COMMENT("zero_words {");
!   cmp(cnt, zero_words_block_size);
    Label around, done, done16;
    br(LO, around);
    {
      RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
      assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
--- 5399,5409 ----
  {
    assert(is_power_of_2(zero_words_block_size), "adjust this");
    assert(ptr == r10 && cnt == r11, "mismatch in register usage");
  
    BLOCK_COMMENT("zero_words {");
!   cmp(cnt, (u1)zero_words_block_size);
    Label around, done, done16;
    br(LO, around);
    {
      RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
      assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");

*** 5580,5598 ****
    Register tmp1 = rscratch1, tmp2 = rscratch2;
  
    mov(result, len); // Save initial len
  
  #ifndef BUILTIN_SIM
!   cmp(len, 8); // handle shortest strings first
    br(LT, LOOP_1);
!   cmp(len, 32);
    br(LT, NEXT_8);
    // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
    // to convert chars to bytes
    if (SoftwarePrefetchHintDistance >= 0) {
      ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
!     cmp(len, SoftwarePrefetchHintDistance/2 + 16);
      br(LE, NEXT_32_START);
      b(NEXT_32_PRFM_START);
      BIND(NEXT_32_PRFM);
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      BIND(NEXT_32_PRFM_START);
--- 5580,5598 ----
    Register tmp1 = rscratch1, tmp2 = rscratch2;
  
    mov(result, len); // Save initial len
  
  #ifndef BUILTIN_SIM
!   cmp(len, (u1)8); // handle shortest strings first
    br(LT, LOOP_1);
!   cmp(len, (u1)32);
    br(LT, NEXT_8);
    // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
    // to convert chars to bytes
    if (SoftwarePrefetchHintDistance >= 0) {
      ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
!     subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
      br(LE, NEXT_32_START);
      b(NEXT_32_PRFM_START);
      BIND(NEXT_32_PRFM);
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      BIND(NEXT_32_PRFM_START);

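This hunk also shows a third variant: the compared value is derived from the runtime flag SoftwarePrefetchHintDistance, so it cannot be proven at compile time to fit the u1 overload, and the diff compares with a flag-setting subtract whose result lands in a temporary. A sketch with hypothetical names, not HotSpot's real declarations:

    // When the threshold is only known at run time, the narrow cmp overload
    // is unusable; a subs into a temporary (or the zero register) sets the
    // same flags and accepts any magnitude, possibly via a scratch register.
    #include <cstdint>
    #include <cstdio>

    typedef uint8_t u1;

    struct Register { int enc; };

    struct AssemblerSketch {
      void cmp(Register r, u1 imm) {                   // compile-time-small constants only
        std::printf("cmp  x%d, #%u\n", r.enc, unsigned(imm));
      }
      void subs(Register d, Register n, int64_t imm) { // any immediate
        std::printf("subs x%d, x%d, #%lld\n", d.enc, n.enc, (long long)imm);
      }
    };

    int main() {
      AssemblerSketch as;
      Register len{2}, tmp2{16};
      int64_t SoftwarePrefetchHintDistance = 256;      // runtime-configurable in HotSpot
      // Same flags as a cmp against the threshold, but legal for any magnitude:
      as.subs(tmp2, len, SoftwarePrefetchHintDistance / 2 + 16);
    }
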
*** 5608,5620 ****
        orr(tmp1, tmp1, tmp2);
        cbnz(tmp1, LOOP_8);
        sub(len, len, 32);
        add(dst, dst, 32);
        add(src, src, 64);
!       cmp(len, SoftwarePrefetchHintDistance/2 + 16);
        br(GE, NEXT_32_PRFM);
!       cmp(len, 32);
        br(LT, LOOP_8);
      BIND(NEXT_32);
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      BIND(NEXT_32_START);
    } else {
--- 5608,5620 ----
        orr(tmp1, tmp1, tmp2);
        cbnz(tmp1, LOOP_8);
        sub(len, len, 32);
        add(dst, dst, 32);
        add(src, src, 64);
!       subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
        br(GE, NEXT_32_PRFM);
!       cmp(len, (u1)32);
        br(LT, LOOP_8);
      BIND(NEXT_32);
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      BIND(NEXT_32_START);
    } else {

*** 5633,5648 ****
      orr(tmp1, tmp1, tmp2);
      cbnz(tmp1, LOOP_8);
      sub(len, len, 32);
      add(dst, dst, 32);
      add(src, src, 64);
!     cmp(len, 32);
      br(GE, NEXT_32);
      cbz(len, DONE);
  
    BIND(LOOP_8);
!     cmp(len, 8);
      br(LT, LOOP_1);
    BIND(NEXT_8);
      ld1(Vtmp1, T8H, src);
      uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
      uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
--- 5633,5648 ----
      orr(tmp1, tmp1, tmp2);
      cbnz(tmp1, LOOP_8);
      sub(len, len, 32);
      add(dst, dst, 32);
      add(src, src, 64);
!     cmp(len, (u1)32);
      br(GE, NEXT_32);
      cbz(len, DONE);
  
    BIND(LOOP_8);
!     cmp(len, (u1)8);
      br(LT, LOOP_1);
    BIND(NEXT_8);
      ld1(Vtmp1, T8H, src);
      uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
      uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes

*** 5651,5661 ****
      cbnz(tmp1, NEXT_1);
  
      sub(len, len, 8);
      add(dst, dst, 8);
      add(src, src, 16);
!     cmp(len, 8);
      br(GE, NEXT_8);
  
    BIND(LOOP_1);
  #endif
    cbz(len, DONE);
--- 5651,5661 ----
      cbnz(tmp1, NEXT_1);
  
      sub(len, len, 8);
      add(dst, dst, 8);
      add(src, src, 16);
!     cmp(len, (u1)8);
      br(GE, NEXT_8);
  
    BIND(LOOP_1);
  #endif
    cbz(len, DONE);

*** 5728,5738 ****
  
    if (SoftwarePrefetchHintDistance >= 0) {
      const int large_loop_threshold = (64 + 16)/8;
      ldrd(vtmp2, post(src, 8));
      andw(len, len, 7);
!     cmp(tmp4, large_loop_threshold);
      br(GE, to_stub);
      b(loop_start);
  
      bind(loop);
      ldrd(vtmp2, post(src, 8));
--- 5728,5738 ----
  
    if (SoftwarePrefetchHintDistance >= 0) {
      const int large_loop_threshold = (64 + 16)/8;
      ldrd(vtmp2, post(src, 8));
      andw(len, len, 7);
!     cmp(tmp4, (u1)large_loop_threshold);
      br(GE, to_stub);
      b(loop_start);
  
      bind(loop);
      ldrd(vtmp2, post(src, 8));
