src/cpu/sparc/vm/macroAssembler_sparc.cpp
Print this page
rev 10891 : tweak array comparison loops
@@ -4549,15 +4549,17 @@
}
}
void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2,
Register limit, Register tmp, Register result, bool is_byte) {
- Label Ldone, Lword;
+ Label Ldone, Lword, Lmisaligned;
assert_different_registers(ary1, ary2, limit, tmp, result);
+ Register tmp2 = result; // may be used as a temp also
int length_offset = arrayOopDesc::length_offset_in_bytes();
int base_offset = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR);
+ const int short_length = 16; // at this size or smaller we don't bother to use 64-bit fetches
if (is_array_equ) {
// return true if the same array
cmp(ary1, ary2);
brx(Assembler::equal, true, Assembler::pn, Ldone);
@@ -4593,39 +4595,149 @@
} else {
// We have no guarantee that on 64 bit the higher half of limit is 0
signx(limit);
}
+ // Check for a short length (16 or less).
+ sub(limit, short_length+1, tmp2);
// Check for doubleword (8 byte) alignment of ary1 and ary2
or3(ary1, ary2, tmp);
- andcc(tmp, 7, tmp);
+ srax(tmp2, 63, tmp2); // = (limit<=16) ? -1 : 0
+ and(tmp, 7, tmp); // = (ary1%8 | ary2%8)
+ or(tmp, tmp2, tmp);
+
br_notnull_short(tmp, Assembler::pn, Lword);
// Aligned, perform doubleword comparison
- array_equals_loop(ary1, ary2, limit, tmp, result, 8, Ldone);
- ba(Ldone);
- delayed()->movcc(Assembler::equal, false, xcc, 1, result);
+ array_equals_loop(ary1, ary2, limit, tmp, result, 8, &Ldone);
bind(Lword);
- // Unaligned, perform word comparison (word alignment is guaranteed)
- array_equals_loop(ary1, ary2, limit, tmp, result, 4, Ldone);
- movcc(Assembler::equal, false, icc, 1, result);
+ cmp_and_brx_short(tmp, 0, Assembler::greater, Assembler::pn, Lmisaligned);
+
+ // Short count, perform word comparison (word alignment is guaranteed)
+ array_equals_loop(ary1, ary2, limit, tmp, result, 4, &Ldone);
+
+ bind(Lmisaligned);
+ // Unaligned doubleword comparison (word alignment is guaranteed)
+ array_equals_loop(ary1, ary2, limit, tmp, result, 8+4, NULL);
bind(Ldone);
}
// Compares two arrays in chunks of size 'byte_width'. The addresses must be aligned accordingly.
void MacroAssembler::array_equals_loop(Register ary1, Register ary2, Register limit, Register tmp,
- Register result, size_t byte_width, Label& Ldone) {
- Label Lloop, Lremaining;
+ Register result, size_t byte_width, Label* Ldone_or_null) {
+ bool misaligned = (byte_width == 8+4); // 8+4 is a sentinel: 8-byte chunks, but an operand is only 4-byte aligned
+ if (misaligned) byte_width = 8;
+ assert(byte_width == 4 || byte_width == 8, "");
+
+ Register word1 = misaligned ? O7 : noreg; // NOTE(review): O7 is clobbered on the misaligned path -- confirm callers tolerate this
+ assert_different_registers(ary1, ary2, limit, tmp, result, word1);
+
+ Label Lloop, Lremaining, Lfallthrough;
// Use appropriate CC register depending on byte_width
Assembler::CC cc = (byte_width == 8) ? xcc : icc;
+ Label& Ldone = *((Ldone_or_null != NULL) ? Ldone_or_null : &Lfallthrough); // NULL: "done" is the end of the code emitted here
+
+ if (misaligned) {
+ // Test for co-alignment.
+ Label Lswap, Lskewed, Lskloop, Lofframp, Laligned; // Laligned is distinct from the outer Lfallthrough (bound at the very end)
+ btst(4, ary2);
+ brx(Assembler::zero, false, Assembler::pn, Lskewed);
+ delayed()->load_sized_value(Address(ary1, 0), word1, byte_width/2, false);
+ // if ary2 is even, then assume ary1 is odd and start the loop right away
+
+ // ary2 is odd, so what about ary1?
+ btst(4, ary1);
+ brx(Assembler::zero, false, Assembler::pn, Lswap);
+ delayed()->load_sized_value(Address(ary2, 0), result, byte_width/2, false);
+
+ // Both are odd. Compare a common first word and go aligned.
+ cmp(result, word1);
+ // Check equality of elements
+ bp(Assembler::notEqual, false, cc, Assembler::pn, Ldone);
+ delayed()->clr(result); // not equal
+
+ add(ary1, 4, ary1);
+ add(ary2, 4, ary2);
+ br(Assembler::always, false, Assembler::pt, Laligned);
+ delayed()->sub(limit, 4, limit);
+ // Finish the loop in 64-bit chunks.
+ // (Caller is responsible to ensure that limit-4 is positive.)
+
+ bind(Lswap);
+ mov(result, word1); // move the 32-bit word just loaded from ary2 into word1
+ // ary1 is odd and ary2 is even, so swap them
+ mov(ary1, tmp);
+ mov(ary2, ary1);
+ mov(tmp, ary2);
+ // and fall through to skewed loop
+
+ bind(Lskewed);
+ // - ary1 is 4 (mod 8)
+ // - ary2 is 0 (mod 8)
+ // - word1 (low-order 32 bits) is ((int*)ary1)[0]
+
// Shift ary1 and ary2 to the end of the arrays, negate limit
add(ary1, limit, ary1);
add(ary2, limit, ary2);
neg(limit, limit);
+ // Align ary1 by pushing it ahead of word1:
+ add(ary1, byte_width/2, ary1);
+
+ bind(Lskloop);
+ // SKEWED MAIN LOOP
+ // Load and compare skewed array elements of size 8 until the elements are not
+ // equal or we reached the end of the arrays. Loop cleanup (in the case of
+ // a remainder of 1..7 bytes) is handled in common with the aligned loop.
+ sllx(word1, 32, result); // put word1 payload into MSW position
+ // we already have word1; now fetch word2 and word3 (in one 64-bit chunk)
+ { Register word23 = word1; // reuse temp locally
+ load_sized_value(Address(ary1, limit), word23, byte_width, false);
+ srlx(word23, 32, tmp); // put word2 payload into LSW position
+ or3(result, tmp, result); // materialize *(unaligned long)(ary1+limit-4) = [word1|word2]
+ // and, the LSW of word1 (= word23) now contains word3, so we are good
+ }
+ load_sized_value(Address(ary2, limit), tmp, byte_width, false);
+ // To avoid accidents, ease out of this loop when we have 12 or fewer bytes left.
+ cmp_and_br_short(limit, -(int)(byte_width*3/2), Assembler::greaterEqual, Assembler::pn, Lofframp);
+ cmp(result, tmp);
+ // Check equality of elements
+ bp(Assembler::equal, false, cc, Assembler::pt, target(Lskloop));
+ delayed()->inc(limit, byte_width);
+
+ ba(Ldone);
+ delayed()->clr(result); // not equal
+
+ bind(Lofframp);
+ // limit is in the range [-12..-4], and there are 4..12 bytes left
+ inccc(limit, byte_width);
+ // Bail out immediately if there are 4..8 bytes left.
+ br(Assembler::positive, false, Assembler::pn, Lremaining);
+ delayed()->xorcc(tmp, result, tmp);
+
+ // There are 9..12 bytes left, so first handle the final 64-bit chunk
+ bp(Assembler::notEqual, true, cc, Assembler::pt, Ldone);
+ delayed()->clr(result); // not equal
+
+ // There are 1..4 bytes left now.
+ sllx(word1, 32, result); // put word1 payload into MSW position
+ load_sized_value(Address(ary2, limit), tmp, byte_width, false);
+ // No need to increment limit by 8; only the bottom 3 bits are significant.
+ ba(Lremaining);
+ delayed()->xorcc(tmp, result, tmp);
+
+ bind(Laligned);
+ // fall through to normal case
+ }
+
+ // Shift ary1 and ary2 to the end of the arrays, negate limit
+ // (Caller is responsible to ensure that limit starts out non-zero.)
+ add(ary1, limit, ary1);
+ add(ary2, limit, ary2);
+ neg(limit, limit);
// MAIN LOOP
// Load and compare array elements of size 'byte_width' until the elements are not
// equal or we reached the end of the arrays. If the size of the arrays is not a
// multiple of 'byte_width', we simply read over the end of the array, bail out and
@@ -4634,11 +4746,11 @@
bind(Lloop);
load_sized_value(Address(ary2, limit), tmp, byte_width, false);
inccc(limit, byte_width);
// Bail out if we reached the end (but still do the comparison)
br(Assembler::positive, false, Assembler::pn, Lremaining);
- delayed()->cmp(result, tmp);
+ delayed()->xorcc(tmp, result, tmp);
// Check equality of elements
bp(Assembler::equal, false, cc, Assembler::pt, target(Lloop));
delayed()->load_sized_value(Address(ary1, limit), result, byte_width, false);
ba(Ldone);
@@ -4646,21 +4758,31 @@
// TAIL COMPARISON
// We got here because we reached the end of the arrays. 'limit' is the number of
// garbage bytes we may have compared by reading over the end of the arrays. Shift
// out the garbage and compare the remaining elements.
+ // The elements are pre-compared bitwise, in that result has been xored into tmp.
+ // Also, in the following code, only the low 2-3 bits of 'limit' are significant.
+ // A 'limit' value of either 0 or byte_width means "preserve all bits".
bind(Lremaining);
// Optimistic shortcut: elements potentially including garbage are equal
bp(Assembler::equal, true, cc, Assembler::pt, target(Ldone));
delayed()->mov(1, result); // equal
// Shift 'limit' bytes to the right and compare
sll(limit, 3, limit); // bytes to bits
- srlx(result, limit, result);
- srlx(tmp, limit, tmp);
- cmp(result, tmp);
clr(result);
- // CC register contains result
+ if (cc == icc) {
+ srl(tmp, limit, tmp);
+ } else {
+ srlx(tmp, limit, tmp);
+ }
+ if (&Ldone != &Lfallthrough) {
+ ba(Ldone);
+ delayed(); // ->movr(...)
+ }
+ movr(Assembler::rc_z, tmp, 1, result); // result = 1 iff the surviving xor bits are all zero; may be a delay slot instruction
+ bind(Lfallthrough);
}
void MacroAssembler::has_negatives(Register inp, Register size, Register result, Register t2, Register t3, Register t4, Register t5) {
// test for negative bytes in input string of a given size