--- old/src/cpu/sparc/vm/macroAssembler_sparc.cpp	2016-04-19 18:34:26.000000000 -0700
+++ new/src/cpu/sparc/vm/macroAssembler_sparc.cpp	2016-04-19 18:34:26.000000000 -0700
@@ -4551,11 +4551,13 @@
 
 void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2,
                                   Register limit, Register tmp, Register result, bool is_byte) {
-  Label Ldone, Lword;
+  Label Ldone, Lword, Lmisaligned;
   assert_different_registers(ary1, ary2, limit, tmp, result);
+  Register tmp2 = result; // alias: result doubles as a second scratch register before it holds the answer
 
   int length_offset  = arrayOopDesc::length_offset_in_bytes();
   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR);
+  const int short_length = 16;  // a limit of this size or smaller skips the 64-bit loops and uses 32-bit fetches
 
   if (is_array_equ) {
     // return true if the same array
@@ -4595,32 +4597,142 @@
     signx(limit);
   }
 
+  // Check for a short length (16 or less).
+  sub(limit, short_length+1, tmp2);
   // Check for doubleword (8 byte) alignment of ary1 and ary2
   or3(ary1, ary2, tmp);
-  andcc(tmp, 7, tmp);
+  srax(tmp2, 63, tmp2);  // = (limit<=16) ? -1 : 0
+  and3(tmp, 7, tmp);     // = (ary1%8 | ary2%8)
+  or3(tmp, tmp2, tmp);   // < 0: short, == 0: long and aligned, > 0: long and misaligned
+
   br_notnull_short(tmp, Assembler::pn, Lword);
 
   // Aligned, perform doubleword comparison
-  array_equals_loop(ary1, ary2, limit, tmp, result, 8, Ldone);
-  ba(Ldone);
-  delayed()->movcc(Assembler::equal, false, xcc, 1, result);
+  array_equals_loop(ary1, ary2, limit, tmp, result, 8, &Ldone);
 
   bind(Lword);
-  // Unaligned, perform word comparison (word alignment is guaranteed)
-  array_equals_loop(ary1, ary2, limit, tmp, result, 4, Ldone);
-  movcc(Assembler::equal, false, icc, 1, result);
+  cmp_and_brx_short(tmp, 0, Assembler::greater, Assembler::pn, Lmisaligned);
+
+  // Short count, perform word comparison (word alignment is guaranteed)
+  array_equals_loop(ary1, ary2, limit, tmp, result, 4, &Ldone);
+
+  bind(Lmisaligned);
+  // Unaligned doubleword comparison (word alignment is guaranteed)
+  array_equals_loop(ary1, ary2, limit, tmp, result, 8+4, NULL);
 
   bind(Ldone);
 }
 
 // Compares two arrays in chunks of size 'byte_width'. The addresses must be aligned accordingly.
 void MacroAssembler::array_equals_loop(Register ary1, Register ary2, Register limit, Register tmp,
-                                       Register result, size_t byte_width, Label& Ldone) {
-  Label Lloop, Lremaining;
+                                       Register result, size_t byte_width, Label* Ldone_or_null) {
+  bool misaligned = (byte_width == 8+4);  // 8+4 requests 8-byte chunks on arrays that are only 4-byte aligned
+  if (misaligned)  byte_width = 8;
+  assert(byte_width == 4 || byte_width == 8, "");
+
+  Register word1 = misaligned ? O7 : noreg;
+  assert_different_registers(ary1, ary2, limit, tmp, result, word1);
+
+  Label Lloop, Lremaining, Lfallthrough;
   // Use appropriate CC register depending on byte_width
   Assembler::CC cc = (byte_width == 8) ? xcc : icc;
 
+  Label& Ldone = *((Ldone_or_null != NULL) ? Ldone_or_null : &Lfallthrough);
+
+  if (misaligned) {
+    // Test for co-alignment.
+    Label Lswap, Lskewed, Lskloop, Lofframp, Laligned;
+    btst(4, ary2);
+    brx(Assembler::zero, false, Assembler::pn, Lskewed);
+    delayed()->load_sized_value(Address(ary1, 0), word1, byte_width/2, false);
+    // if ary2 is even, then assume ary1 is odd and start the loop right away
+
+    // ary2 is odd, so what about ary1?
+    btst(4, ary1);
+    brx(Assembler::zero, false, Assembler::pn, Lswap);
+    delayed()->load_sized_value(Address(ary2, 0), result, byte_width/2, false);
+
+    // Both are odd.  Compare a common first word and go aligned.
+    cmp(result, word1);
+    // Check equality of elements
+    bp(Assembler::notEqual, false, cc, Assembler::pn, Ldone);
+    delayed()->clr(result); // not equal
+
+    add(ary1, 4, ary1);
+    add(ary2, 4, ary2);
+    br(Assembler::always, false, Assembler::pt, Laligned);
+    delayed()->sub(limit, 4, limit);
+    // Finish the loop in 64-bit chunks.
+    // (Caller is responsible to ensure that limit-4 is positive.)
+
+    bind(Lswap);
+    mov(result, word1);  // grab the loaded 32-bit word into the correct register
+    // ary1 is odd and ary2 is even, so swap them
+    mov(ary1, tmp);
+    mov(ary2, ary1);
+    mov(tmp, ary2);
+    // and fall through to skewed loop
+
+    bind(Lskewed);
+    // - ary1 is 4 (mod 8)
+    // - ary2 is 0 (mod 8)
+    // - word1 (low-order 32 bits) is ((int*)ary1)[0]
+
+    // Shift ary1 and ary2 to the end of the arrays, negate limit
+    add(ary1, limit, ary1);
+    add(ary2, limit, ary2);
+    neg(limit, limit);
+    // Align ary1 by pushing it ahead of word1:
+    add(ary1, byte_width/2, ary1);
+
+    bind(Lskloop);
+    // SKEWED MAIN LOOP
+    // Load and compare skewed array elements of size 8 until the elements are not
+    // equal or we reached the end of the arrays.  Loop cleanup (in the case of
+    // a remainder of 1..7 bytes) is handled in common with the aligned loop.
+    sllx(word1, 32, result);  // put word1 payload into MSW position
+    // we already have word1; now fetch word2 and word3 (in one 64-bit chunk)
+    { Register word23 = word1;  // reuse temp locally
+      load_sized_value(Address(ary1, limit), word23, byte_width, false);
+      srlx(word23, 32, tmp);  // put word2 payload into LSW position
+      or3(result, tmp, result);  // materialize *(unaligned long)(ary1+limit-4) = [word1|word2]
+      // and, the LSW of word1 (= word23) now contains word3, so we are good
+    }
+    load_sized_value(Address(ary2, limit), tmp, byte_width, false);
+    // To avoid accidents, ease out of this loop when we have 12 or fewer bytes left.
+    cmp_and_br_short(limit, -(int)(byte_width*3/2), Assembler::greaterEqual, Assembler::pn, Lofframp);
+    cmp(result, tmp);
+    // Check equality of elements
+    bp(Assembler::equal, false, cc, Assembler::pt, target(Lskloop));
+    delayed()->inc(limit, byte_width);
+
+    ba(Ldone);
+    delayed()->clr(result); // not equal
+
+    bind(Lofframp);
+    // limit is in the range [-12..-4], and there are 4..12 bytes left
+    inccc(limit, byte_width);
+    // Bail out immediately if there are 4..8 bytes left.
+    br(Assembler::positive, false, Assembler::pn, Lremaining);
+    delayed()->xorcc(tmp, result, tmp);
+
+    // There are 9..12 bytes left, so first handle the final 64-bit chunk
+    bp(Assembler::notEqual, true, cc, Assembler::pt, Ldone);
+    delayed()->clr(result); // not equal
+
+    // There are 1..4 bytes left now.
+    sllx(word1, 32, result);  // put word1 payload into MSW position
+    load_sized_value(Address(ary2, limit), tmp, byte_width, false);
+    // No need to increment limit by 8; only the bottom 3 bits are significant.
+    ba(Lremaining);
+    delayed()->xorcc(tmp, result, tmp);
+
+    bind(Laligned);
+    // both arrays are now 8-byte aligned; fall through to normal case
+  }
+
   // Shift ary1 and ary2 to the end of the arrays, negate limit
+  // (Caller is responsible to ensure that limit starts out non-zero.)
   add(ary1, limit, ary1);
   add(ary2, limit, ary2);
   neg(limit, limit);
@@ -4636,7 +4748,7 @@
   inccc(limit, byte_width);
   // Bail out if we reached the end (but still do the comparison)
   br(Assembler::positive, false, Assembler::pn, Lremaining);
-  delayed()->cmp(result, tmp);
+  delayed()->xorcc(tmp, result, tmp);  // compare by xor: tmp keeps the differing bits for Lremaining
   // Check equality of elements
   bp(Assembler::equal, false, cc, Assembler::pt, target(Lloop));
   delayed()->load_sized_value(Address(ary1, limit), result, byte_width, false);
@@ -4648,17 +4760,27 @@
   // We got here because we reached the end of the arrays. 'limit' is the number of
   // garbage bytes we may have compared by reading over the end of the arrays. Shift
   // out the garbage and compare the remaining elements.
+  // The elements are pre-compared bitwise: every path that reaches Lremaining has xored result into tmp.
+  // Also, in the following code, only the low 2-3 bits of 'limit' are significant.
+  // A 'limit' value of either 0 or byte_width means "preserve all bits" (no garbage to shift out).
   bind(Lremaining);
   // Optimistic shortcut: elements potentially including garbage are equal
   bp(Assembler::equal, true, cc, Assembler::pt, target(Ldone));
   delayed()->mov(1, result); // equal
   // Shift 'limit' bytes to the right and compare
   sll(limit, 3, limit); // bytes to bits
-  srlx(result, limit, result);
-  srlx(tmp, limit, tmp);
-  cmp(result, tmp);
   clr(result);
-  // CC register contains result
+  if (cc == icc) {
+    srl(tmp, limit, tmp);
+  } else {
+    srlx(tmp, limit, tmp);
+  }
+  if (&Ldone != &Lfallthrough) {
+    ba(Ldone);
+    delayed(); // the movr emitted just below fills this delay slot
+  }
+  movr(tmp, Assembler::rc_z, 1, result);  // result = 1 if no differing bits remain in tmp; else stays 0
+  bind(Lfallthrough);
 }
 
 void MacroAssembler::has_negatives(Register inp, Register size, Register result, Register t2, Register t3, Register t4, Register t5) {