--- old/src/cpu/sparc/vm/macroAssembler_sparc.cpp 2016-04-19 18:34:26.000000000 -0700 +++ new/src/cpu/sparc/vm/macroAssembler_sparc.cpp 2016-04-19 18:34:26.000000000 -0700 @@ -4551,11 +4551,13 @@ void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2, Register limit, Register tmp, Register result, bool is_byte) { - Label Ldone, Lword; + Label Ldone, Lword, Lmisaligned; assert_different_registers(ary1, ary2, limit, tmp, result); + Register tmp2 = result; // may be used as a temp also int length_offset = arrayOopDesc::length_offset_in_bytes(); int base_offset = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR); + const int short_length = 16; // at this size or smaller we don't bother to use 64-bit fetches if (is_array_equ) { // return true if the same array @@ -4595,32 +4597,142 @@ signx(limit); } + // Check for a short length (16 or less). + sub(limit, short_length+1, tmp2); // Check for doubleword (8 byte) alignment of ary1 and ary2 or3(ary1, ary2, tmp); - andcc(tmp, 7, tmp); + srax(tmp2, 63, tmp2); // = (limit<=16) ? 
-1 : 0 + and3(tmp, 7, tmp); // = (ary1%8 | ary2%8) + or3(tmp, tmp2, tmp); + br_notnull_short(tmp, Assembler::pn, Lword); // Aligned, perform doubleword comparison - array_equals_loop(ary1, ary2, limit, tmp, result, 8, Ldone); - ba(Ldone); - delayed()->movcc(Assembler::equal, false, xcc, 1, result); + array_equals_loop(ary1, ary2, limit, tmp, result, 8, &Ldone); bind(Lword); - // Unaligned, perform word comparison (word alignment is guaranteed) - array_equals_loop(ary1, ary2, limit, tmp, result, 4, Ldone); - movcc(Assembler::equal, false, icc, 1, result); + cmp_and_brx_short(tmp, 0, Assembler::greater, Assembler::pn, Lmisaligned); + + // Short count, perform word comparison (word alignment is guaranteed) + array_equals_loop(ary1, ary2, limit, tmp, result, 4, &Ldone); + + bind(Lmisaligned); + // Unaligned doubleword comparison (word alignment is guaranteed) + array_equals_loop(ary1, ary2, limit, tmp, result, 8+4, NULL); bind(Ldone); } // Compares two arrays in chunks of size 'byte_width'. The addresses must be aligned accordingly. void MacroAssembler::array_equals_loop(Register ary1, Register ary2, Register limit, Register tmp, - Register result, size_t byte_width, Label& Ldone) { - Label Lloop, Lremaining; + Register result, size_t byte_width, Label* Ldone_or_null) { + bool misaligned = (byte_width == 8+4); + if (misaligned) byte_width = 8; + assert(byte_width == 4 || byte_width == 8, ""); + + Register word1 = misaligned ? O7 : noreg; + assert_different_registers(ary1, ary2, limit, tmp, result, word1); + + Label Lloop, Lremaining, Lfallthrough; // Use appropriate CC register depending on byte_width Assembler::CC cc = (byte_width == 8) ? xcc : icc; + Label& Ldone = *((Ldone_or_null != NULL) ? Ldone_or_null : &Lfallthrough); + + if (misaligned) { + // Test for co-alignment. 
+ Label Lswap, Lskewed, Lskloop, Lofframp, Lfallthrough; + btst(4, ary2); + brx(Assembler::zero, false, Assembler::pn, Lskewed); + delayed()->load_sized_value(Address(ary1, 0), word1, byte_width/2, false); + // if ary2 is even, then assume ary1 is odd and start the loop right away + + // ary2 is odd, so what about ary1? + btst(4, ary1); + brx(Assembler::zero, false, Assembler::pn, Lswap); + delayed()->load_sized_value(Address(ary2, 0), result, byte_width/2, false); + + // Both are odd. Compare a common first word and go aligned. + cmp(result, word1); + // Check equality of elements + bp(Assembler::notEqual, false, cc, Assembler::pn, Ldone); + delayed()->clr(result); // not equal + + add(ary1, 4, ary1); + add(ary2, 4, ary2); + br(Assembler::always, false, Assembler::pt, Lfallthrough); + delayed()->sub(limit, 4, limit); + // Finish the loop in 64-bit chunks. + // (Caller is responsible to ensure that limit-4 is positive.) + + bind(Lswap); + mov(result, word1); // grab loaded half-word into correct register + // ary1 is odd and ary2 is even, so swap them + mov(ary1, tmp); + mov(ary2, ary1); + mov(tmp, ary2); + // and fall through to skewed loop + + bind(Lskewed); + // - ary1 is 4 (mod 8) + // - ary2 is 0 (mod 8) + // - word1 (low-order 32 bits) is ((int*)ary1)[0] + + // Shift ary1 and ary2 to the end of the arrays, negate limit + add(ary1, limit, ary1); + add(ary2, limit, ary2); + neg(limit, limit); + // Align ary1 by pushing it ahead of word1: + add(ary1, byte_width/2, ary1); + + bind(Lskloop); + // SKEWED MAIN LOOP + // Load and compare skewed array elements of size 8 until the elements are not + // equal or we reached the end of the arrays. Loop cleanup (in the case of + // a remainder of 1..7 bytes) is handled in common with the aligned loop. 
+ sllx(word1, 32, result); // put word1 payload into MSW position + // we already have word1; now fetch word2 and word3 (in one 64-bit chunk) + { Register word23 = word1; // reuse temp locally + load_sized_value(Address(ary1, limit), word23, byte_width, false); + srlx(word23, 32, tmp); // put word2 payload into LSW position + or3(result, tmp, result); // materialize *(unaligned long)(ary1+limit-4) = [word1|word2] + // and, the LSW of word1 (= word23) now contains word3, so we are good + } + load_sized_value(Address(ary2, limit), tmp, byte_width, false); + // To avoid accidents, ease out of this loop when we have 12 or fewer bytes left. + cmp_and_br_short(limit, -(int)(byte_width*3/2), Assembler::greaterEqual, Assembler::pn, Lofframp); + cmp(result, tmp); + // Check equality of elements + bp(Assembler::equal, false, cc, Assembler::pt, target(Lskloop)); + delayed()->inc(limit, byte_width); + + ba(Ldone); + delayed()->clr(result); // not equal + + bind(Lofframp); + // limit is in the range [-12..-4], and there are 4..12 bytes left + inccc(limit, byte_width); + // Bail out immediately if there are 4..8 bytes left. + br(Assembler::positive, false, Assembler::pn, Lremaining); + delayed()->xorcc(tmp, result, tmp); + + // There are 9..12 bytes left, so first handle the final 64-bit chunk + bp(Assembler::notEqual, true, cc, Assembler::pt, Ldone); + delayed()->clr(result); // not equal + + // There are 1..4 bytes left now. + sllx(word1, 32, result); // put word1 payload into MSW position + load_sized_value(Address(ary2, limit), tmp, byte_width, false); + // No need to increment limit by 8; only the bottom 3 bits are significant. + ba(Lremaining); + delayed()->xorcc(tmp, result, tmp); + + bind(Lfallthrough); + // fall through to normal case + } + // Shift ary1 and ary2 to the end of the arrays, negate limit + // (Caller is responsible to ensure that limit starts out non-zero.) 
add(ary1, limit, ary1); add(ary2, limit, ary2); neg(limit, limit); @@ -4636,7 +4748,7 @@ inccc(limit, byte_width); // Bail out if we reached the end (but still do the comparison) br(Assembler::positive, false, Assembler::pn, Lremaining); - delayed()->cmp(result, tmp); + delayed()->xorcc(tmp, result, tmp); // Check equality of elements bp(Assembler::equal, false, cc, Assembler::pt, target(Lloop)); delayed()->load_sized_value(Address(ary1, limit), result, byte_width, false); @@ -4648,17 +4760,27 @@ // We got here because we reached the end of the arrays. 'limit' is the number of // garbage bytes we may have compared by reading over the end of the arrays. Shift // out the garbage and compare the remaining elements. + // The elements are pre-compared bitwise, in that result has been xored into tmp. + // Also, in the following code, only the low 2-3 bits of 'limit' are significant. + // A 'limit' value of either 0 or byte_width means "preserve all bits". bind(Lremaining); // Optimistic shortcut: elements potentially including garbage are equal bp(Assembler::equal, true, cc, Assembler::pt, target(Ldone)); delayed()->mov(1, result); // equal // Shift 'limit' bytes to the right and compare sll(limit, 3, limit); // bytes to bits - srlx(result, limit, result); - srlx(tmp, limit, tmp); - cmp(result, tmp); clr(result); - // CC register contains result + if (cc == icc) { + srl(tmp, limit, tmp); + } else { + srlx(tmp, limit, tmp); + } + if (&Ldone != &Lfallthrough) { + ba(Ldone); + delayed(); // delay slot is intentionally filled by the movr emitted just below + } + movr(tmp, Assembler::rc_z, 1, result); // result was cleared above; presumably sets it to 1 only when the xored bits in tmp are all zero. Sits in the ba delay slot when Ldone is a caller label. + bind(Lfallthrough); } void MacroAssembler::has_negatives(Register inp, Register size, Register result, Register t2, Register t3, Register t4, Register t5) {