< prev index next >
src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
Print this page
*** 492,502 ****
if (!swap_reg_contains_mark) {
null_check_offset = offset();
ldr(swap_reg, mark_addr);
}
andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
! cmp(tmp_reg, markOopDesc::biased_lock_pattern);
br(Assembler::NE, cas_label);
// The bias pattern is present in the object's header. Need to check
// whether the bias owner and the epoch are both still current.
load_prototype_header(tmp_reg, obj_reg);
orr(tmp_reg, tmp_reg, rthread);
--- 492,502 ----
if (!swap_reg_contains_mark) {
null_check_offset = offset();
ldr(swap_reg, mark_addr);
}
andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
! cmp(tmp_reg, (u1)markOopDesc::biased_lock_pattern);
br(Assembler::NE, cas_label);
// The bias pattern is present in the object's header. Need to check
// whether the bias owner and the epoch are both still current.
load_prototype_header(tmp_reg, obj_reg);
orr(tmp_reg, tmp_reg, rthread);
*** 631,641 ****
// a higher level. Second, if the bias was revoked while we held the
// lock, the object could not be rebiased toward another thread, so
// the bias bit would be clear.
ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
! cmp(temp_reg, markOopDesc::biased_lock_pattern);
br(Assembler::EQ, done);
}
static void pass_arg0(MacroAssembler* masm, Register arg) {
if (c_rarg0 != arg ) {
--- 631,641 ----
// a higher level. Second, if the bias was revoked while we held the
// lock, the object could not be rebiased toward another thread, so
// the bias bit would be clear.
ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
! cmp(temp_reg, (u1)markOopDesc::biased_lock_pattern);
br(Assembler::EQ, done);
}
static void pass_arg0(MacroAssembler* masm, Register arg) {
if (c_rarg0 != arg ) {
*** 1127,1137 ****
// So if it was a primary super, we can just fail immediately.
// Otherwise, it's the slow path for us (no success at this point).
if (super_check_offset.is_register()) {
br(Assembler::EQ, *L_success);
! cmp(super_check_offset.as_register(), sc_offset);
if (L_failure == &L_fallthrough) {
br(Assembler::EQ, *L_slow_path);
} else {
br(Assembler::NE, *L_failure);
final_jmp(*L_slow_path);
--- 1127,1137 ----
// So if it was a primary super, we can just fail immediately.
// Otherwise, it's the slow path for us (no success at this point).
if (super_check_offset.is_register()) {
br(Assembler::EQ, *L_success);
! subs(zr, super_check_offset.as_register(), sc_offset);
if (L_failure == &L_fallthrough) {
br(Assembler::EQ, *L_slow_path);
} else {
br(Assembler::NE, *L_failure);
final_jmp(*L_slow_path);
*** 3302,3312 ****
add(table1, table0, 1*256*sizeof(juint));
add(table2, table0, 2*256*sizeof(juint));
add(table3, table0, 3*256*sizeof(juint));
if (UseNeon) {
! cmp(len, 64);
br(Assembler::LT, L_by16);
eor(v16, T16B, v16, v16);
Label L_fold;
--- 3302,3312 ----
add(table1, table0, 1*256*sizeof(juint));
add(table2, table0, 2*256*sizeof(juint));
add(table3, table0, 3*256*sizeof(juint));
if (UseNeon) {
! cmp(len, (u1)64);
br(Assembler::LT, L_by16);
eor(v16, T16B, v16, v16);
Label L_fold;
*** 4352,4365 ****
// For larger pattern and source we use a simplified Boyer Moore algorithm.
// With a small pattern and source we use linear scan.
if (icnt1 == -1) {
sub(result_tmp, cnt2, cnt1);
! cmp(cnt1, 8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
br(LT, LINEARSEARCH);
dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
! cmp(cnt1, 256);
lsr(tmp1, cnt2, 2);
ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
br(GE, LINEARSTUB);
}
--- 4352,4365 ----
// For larger pattern and source we use a simplified Boyer Moore algorithm.
// With a small pattern and source we use linear scan.
if (icnt1 == -1) {
sub(result_tmp, cnt2, cnt1);
! cmp(cnt1, (u1)8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
br(LT, LINEARSEARCH);
dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
! subs(zr, cnt1, 256);
lsr(tmp1, cnt2, 2);
ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
br(GE, LINEARSTUB);
}
*** 4461,4471 ****
sub(ch2, cnt1, 1);
mov(tmp3, str1);
BIND(BCLOOP);
(this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
if (!str1_isL) {
! cmp(ch1, ASIZE);
br(HS, BCSKIP);
}
strb(ch2, Address(sp, ch1));
BIND(BCSKIP);
subs(ch2, ch2, 1);
--- 4461,4471 ----
sub(ch2, cnt1, 1);
mov(tmp3, str1);
BIND(BCLOOP);
(this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
if (!str1_isL) {
! subs(zr, ch1, ASIZE);
br(HS, BCSKIP);
}
strb(ch2, Address(sp, ch1));
BIND(BCSKIP);
subs(ch2, ch2, 1);
*** 4525,4535 ****
if (str1_isL != str2_isL) {
mov(result_tmp, cnt1);
} else {
mov(result_tmp, 1);
}
! cmp(skipch, ASIZE);
br(HS, BMADV);
}
ldrb(result_tmp, Address(sp, skipch)); // load skip distance
BIND(BMADV);
sub(cnt1tmp, cnt1, 1);
--- 4525,4535 ----
if (str1_isL != str2_isL) {
mov(result_tmp, cnt1);
} else {
mov(result_tmp, 1);
}
! subs(zr, skipch, ASIZE);
br(HS, BMADV);
}
ldrb(result_tmp, Address(sp, skipch)); // load skip distance
BIND(BMADV);
sub(cnt1tmp, cnt1, 1);
*** 4546,4556 ****
if (!str2_isL) lsr(result, result, 1);
add(sp, sp, ASIZE);
b(DONE);
BIND(LINEARSTUB);
! cmp(cnt1, 16); // small patterns still should be handled by simple algorithm
br(LT, LINEAR_MEDIUM);
mov(result, zr);
RuntimeAddress stub = NULL;
if (isL) {
stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
--- 4546,4556 ----
if (!str2_isL) lsr(result, result, 1);
add(sp, sp, ASIZE);
b(DONE);
BIND(LINEARSTUB);
! cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
br(LT, LINEAR_MEDIUM);
mov(result, zr);
RuntimeAddress stub = NULL;
if (isL) {
stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
*** 4575,4585 ****
if (icnt1 == -1)
{
Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
! cmp(cnt1, str1_isL == str2_isL ? 4 : 2);
br(LT, DOSHORT);
BIND(LINEAR_MEDIUM);
(this->*str1_load_1chr)(first, Address(str1));
lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
--- 4575,4585 ----
if (icnt1 == -1)
{
Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
! cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
br(LT, DOSHORT);
BIND(LINEAR_MEDIUM);
(this->*str1_load_1chr)(first, Address(str1));
lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
*** 4610,4620 ****
br(LT, STR1_NEXT);
b(MATCH);
BIND(DOSHORT);
if (str1_isL == str2_isL) {
! cmp(cnt1, 2);
br(LT, DO1);
br(GT, DO3);
}
}
--- 4610,4620 ----
br(LT, STR1_NEXT);
b(MATCH);
BIND(DOSHORT);
if (str1_isL == str2_isL) {
! cmp(cnt1, (u1)2);
br(LT, DO1);
br(GT, DO3);
}
}
*** 4685,4695 ****
if (icnt1 == -1 || icnt1 == 1) {
Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
BIND(DO1);
(this->*str1_load_1chr)(ch1, str1);
! cmp(cnt2, 8);
br(LT, DO1_SHORT);
sub(result_tmp, cnt2, 8/str2_chr_size);
sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
--- 4685,4695 ----
if (icnt1 == -1 || icnt1 == 1) {
Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
BIND(DO1);
(this->*str1_load_1chr)(ch1, str1);
! cmp(cnt2, (u1)8);
br(LT, DO1_SHORT);
sub(result_tmp, cnt2, 8/str2_chr_size);
sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
*** 4708,4718 ****
bics(tmp1, tmp1, tmp2);
br(NE, HAS_ZERO);
adds(cnt2_neg, cnt2_neg, 8);
br(LT, CH1_LOOP);
! cmp(cnt2_neg, 8);
mov(cnt2_neg, 0);
br(LT, CH1_LOOP);
b(NOMATCH);
BIND(HAS_ZERO);
--- 4708,4718 ----
bics(tmp1, tmp1, tmp2);
br(NE, HAS_ZERO);
adds(cnt2_neg, cnt2_neg, 8);
br(LT, CH1_LOOP);
! cmp(cnt2_neg, (u1)8);
mov(cnt2_neg, 0);
br(LT, CH1_LOOP);
b(NOMATCH);
BIND(HAS_ZERO);
*** 4751,4761 ****
Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
Register cnt1_neg = cnt1;
Register ch1 = rscratch1;
Register result_tmp = rscratch2;
! cmp(cnt1, 4);
br(LT, DO1_SHORT);
orr(ch, ch, ch, LSL, 16);
orr(ch, ch, ch, LSL, 32);
--- 4751,4761 ----
Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
Register cnt1_neg = cnt1;
Register ch1 = rscratch1;
Register result_tmp = rscratch2;
! cmp(cnt1, (u1)4);
br(LT, DO1_SHORT);
orr(ch, ch, ch, LSL, 16);
orr(ch, ch, ch, LSL, 32);
*** 4774,4784 ****
bics(tmp1, tmp1, tmp2);
br(NE, HAS_ZERO);
adds(cnt1_neg, cnt1_neg, 8);
br(LT, CH1_LOOP);
! cmp(cnt1_neg, 8);
mov(cnt1_neg, 0);
br(LT, CH1_LOOP);
b(NOMATCH);
BIND(HAS_ZERO);
--- 4774,4784 ----
bics(tmp1, tmp1, tmp2);
br(NE, HAS_ZERO);
adds(cnt1_neg, cnt1_neg, 8);
br(LT, CH1_LOOP);
! cmp(cnt1_neg, (u1)8);
mov(cnt1_neg, 0);
br(LT, CH1_LOOP);
b(NOMATCH);
BIND(HAS_ZERO);
*** 4811,4821 ****
FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
SHORT_LOOP_START, TAIL_CHECK;
! const int STUB_THRESHOLD = 64 + 8;
bool isLL = ae == StrIntrinsicNode::LL;
bool isLU = ae == StrIntrinsicNode::LU;
bool isUL = ae == StrIntrinsicNode::UL;
bool str1_isL = isLL || isLU;
--- 4811,4821 ----
FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
SHORT_LOOP_START, TAIL_CHECK;
! const u1 STUB_THRESHOLD = 64 + 8;
bool isLL = ae == StrIntrinsicNode::LL;
bool isLU = ae == StrIntrinsicNode::LU;
bool isUL = ae == StrIntrinsicNode::UL;
bool str1_isL = isLL || isLU;
*** 5206,5219 ****
ldrw(cnt1, Address(a1, length_offset));
cbz(a2, DONE);
ldrw(cnt2, Address(a2, length_offset));
// on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's
// faster to perform another branch before comparing a1 and a2
! cmp(cnt1, elem_per_word);
br(LE, SHORT); // short or same
ldr(tmp3, Address(pre(a1, base_offset)));
! cmp(cnt1, stubBytesThreshold);
br(GE, STUB);
ldr(tmp4, Address(pre(a2, base_offset)));
sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
cmp(cnt2, cnt1);
br(NE, DONE);
--- 5206,5219 ----
ldrw(cnt1, Address(a1, length_offset));
cbz(a2, DONE);
ldrw(cnt2, Address(a2, length_offset));
// on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's
// faster to perform another branch before comparing a1 and a2
! cmp(cnt1, (u1)elem_per_word);
br(LE, SHORT); // short or same
ldr(tmp3, Address(pre(a1, base_offset)));
! subs(zr, cnt1, stubBytesThreshold);
br(GE, STUB);
ldr(tmp4, Address(pre(a2, base_offset)));
sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
cmp(cnt2, cnt1);
br(NE, DONE);
*** 5226,5236 ****
br(LE, TAIL);
eor(tmp4, tmp3, tmp4);
cbnz(tmp4, DONE);
ldr(tmp3, Address(pre(a1, wordSize)));
ldr(tmp4, Address(pre(a2, wordSize)));
! cmp(cnt1, elem_per_word);
br(LE, TAIL2);
cmp(tmp1, tmp2);
} br(EQ, NEXT_DWORD);
b(DONE);
--- 5226,5236 ----
br(LE, TAIL);
eor(tmp4, tmp3, tmp4);
cbnz(tmp4, DONE);
ldr(tmp3, Address(pre(a1, wordSize)));
ldr(tmp4, Address(pre(a2, wordSize)));
! cmp(cnt1, (u1)elem_per_word);
br(LE, TAIL2);
cmp(tmp1, tmp2);
} br(EQ, NEXT_DWORD);
b(DONE);
*** 5399,5409 ****
{
assert(is_power_of_2(zero_words_block_size), "adjust this");
assert(ptr == r10 && cnt == r11, "mismatch in register usage");
BLOCK_COMMENT("zero_words {");
! cmp(cnt, zero_words_block_size);
Label around, done, done16;
br(LO, around);
{
RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
--- 5399,5409 ----
{
assert(is_power_of_2(zero_words_block_size), "adjust this");
assert(ptr == r10 && cnt == r11, "mismatch in register usage");
BLOCK_COMMENT("zero_words {");
! cmp(cnt, (u1)zero_words_block_size);
Label around, done, done16;
br(LO, around);
{
RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
*** 5580,5598 ****
Register tmp1 = rscratch1, tmp2 = rscratch2;
mov(result, len); // Save initial len
#ifndef BUILTIN_SIM
! cmp(len, 8); // handle shortest strings first
br(LT, LOOP_1);
! cmp(len, 32);
br(LT, NEXT_8);
// The following code uses the SIMD 'uzp1' and 'uzp2' instructions
// to convert chars to bytes
if (SoftwarePrefetchHintDistance >= 0) {
ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
! cmp(len, SoftwarePrefetchHintDistance/2 + 16);
br(LE, NEXT_32_START);
b(NEXT_32_PRFM_START);
BIND(NEXT_32_PRFM);
ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
BIND(NEXT_32_PRFM_START);
--- 5580,5598 ----
Register tmp1 = rscratch1, tmp2 = rscratch2;
mov(result, len); // Save initial len
#ifndef BUILTIN_SIM
! cmp(len, (u1)8); // handle shortest strings first
br(LT, LOOP_1);
! cmp(len, (u1)32);
br(LT, NEXT_8);
// The following code uses the SIMD 'uzp1' and 'uzp2' instructions
// to convert chars to bytes
if (SoftwarePrefetchHintDistance >= 0) {
ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
! subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
br(LE, NEXT_32_START);
b(NEXT_32_PRFM_START);
BIND(NEXT_32_PRFM);
ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
BIND(NEXT_32_PRFM_START);
*** 5608,5620 ****
orr(tmp1, tmp1, tmp2);
cbnz(tmp1, LOOP_8);
sub(len, len, 32);
add(dst, dst, 32);
add(src, src, 64);
! cmp(len, SoftwarePrefetchHintDistance/2 + 16);
br(GE, NEXT_32_PRFM);
! cmp(len, 32);
br(LT, LOOP_8);
BIND(NEXT_32);
ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
BIND(NEXT_32_START);
} else {
--- 5608,5620 ----
orr(tmp1, tmp1, tmp2);
cbnz(tmp1, LOOP_8);
sub(len, len, 32);
add(dst, dst, 32);
add(src, src, 64);
! subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
br(GE, NEXT_32_PRFM);
! cmp(len, (u1)32);
br(LT, LOOP_8);
BIND(NEXT_32);
ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
BIND(NEXT_32_START);
} else {
*** 5633,5648 ****
orr(tmp1, tmp1, tmp2);
cbnz(tmp1, LOOP_8);
sub(len, len, 32);
add(dst, dst, 32);
add(src, src, 64);
! cmp(len, 32);
br(GE, NEXT_32);
cbz(len, DONE);
BIND(LOOP_8);
! cmp(len, 8);
br(LT, LOOP_1);
BIND(NEXT_8);
ld1(Vtmp1, T8H, src);
uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
--- 5633,5648 ----
orr(tmp1, tmp1, tmp2);
cbnz(tmp1, LOOP_8);
sub(len, len, 32);
add(dst, dst, 32);
add(src, src, 64);
! cmp(len, (u1)32);
br(GE, NEXT_32);
cbz(len, DONE);
BIND(LOOP_8);
! cmp(len, (u1)8);
br(LT, LOOP_1);
BIND(NEXT_8);
ld1(Vtmp1, T8H, src);
uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
*** 5651,5661 ****
cbnz(tmp1, NEXT_1);
sub(len, len, 8);
add(dst, dst, 8);
add(src, src, 16);
! cmp(len, 8);
br(GE, NEXT_8);
BIND(LOOP_1);
#endif
cbz(len, DONE);
--- 5651,5661 ----
cbnz(tmp1, NEXT_1);
sub(len, len, 8);
add(dst, dst, 8);
add(src, src, 16);
! cmp(len, (u1)8);
br(GE, NEXT_8);
BIND(LOOP_1);
#endif
cbz(len, DONE);
*** 5728,5738 ****
if (SoftwarePrefetchHintDistance >= 0) {
const int large_loop_threshold = (64 + 16)/8;
ldrd(vtmp2, post(src, 8));
andw(len, len, 7);
! cmp(tmp4, large_loop_threshold);
br(GE, to_stub);
b(loop_start);
bind(loop);
ldrd(vtmp2, post(src, 8));
--- 5728,5738 ----
if (SoftwarePrefetchHintDistance >= 0) {
const int large_loop_threshold = (64 + 16)/8;
ldrd(vtmp2, post(src, 8));
andw(len, len, 7);
! cmp(tmp4, (u1)large_loop_threshold);
br(GE, to_stub);
b(loop_start);
bind(loop);
ldrd(vtmp2, post(src, 8));
< prev index next >