src/cpu/aarch64/vm/stubGenerator_aarch64.cpp

rev 9041 : 8159063: aarch64: optimise unaligned array copy long
Reviewed-by: aph
Contributed-by: edward.nevill@gmail.com, adinn@redhat.com

@@ -802,10 +802,16 @@
     else
       stub_name = "backward_copy_longs";
     StubCodeMark mark(this, "StubRoutines", stub_name);
     __ align(CodeEntryAlignment);
     __ bind(start);
+
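+    // With AvoidUnalignedAccesses, check bit 3 of the destination:
+    // if it is set then d is 8 byte but not 16 byte aligned and we
+    // branch to a variant below which keeps all the stp stores on
+    // 16 byte boundaries.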
+    Label unaligned_copy_long;
+    if (AvoidUnalignedAccesses) {
+      __ tbnz(d, 3, unaligned_copy_long);
+    }
+
     if (direction == copy_forwards) {
       __ sub(s, s, bias);
       __ sub(d, d, bias);
     }
 

@@ -902,10 +908,202 @@
       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
       __ bind(L2);
     }
 
     __ ret(lr);
+
+    if (AvoidUnalignedAccesses) {
+      Label drain, again;
+      // Register order for storing. Order is different for backward copy.
+
+      __ bind(unaligned_copy_long);
+
+      // source address is even aligned, target odd aligned
+      //
+      // when forward copying word pairs we read long pairs at offsets
+      // {0, 2, 4, 6} (in long words). when backwards copying we read
+      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
+      // address by -2 in the forwards case so we can compute the
+      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
+      // or -1.
+      //
+      // when forward copying we need to store 1 word, 3 pairs and
+      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
+      // zero offset we adjust the destination by -1 which means we
+      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
+      //
+      // When backwards copying we need to store 1 word, 3 pairs and
+      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
+      // offsets {1, 3, 5, 7, 8} * unit.
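+      //
+      // the net effect for the forwards case is that the leading str
+      // writes the word at the original d (offset 0), the three stp
+      // write the pairs at offsets {1, 2}, {3, 4} and {5, 6} and the
+      // trailing str writes the word at offset 7. because the
+      // original d is odd word aligned every stp destination falls on
+      // a 16 byte boundary, which is what this variant is for.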
+
+      if (direction == copy_forwards) {
+        __ sub(s, s, 16);
+        __ sub(d, d, 8);
+      }
+
+      // Fill 8 registers
+      //
+      // for forwards copy s was offset by -16 from the original input
+      // value of s so the register contents are at these offsets
+      // relative to the 64 byte block addressed by that original input
+      // and so on for each successive 64 byte block when s is updated
+      //
+      // t0 at offset 0,  t1 at offset 8
+      // t2 at offset 16, t3 at offset 24
+      // t4 at offset 32, t5 at offset 40
+      // t6 at offset 48, t7 at offset 56
+
+      // for backwards copy s was not offset so the register contents
+      // are at these offsets into the 64 byte block preceding that
+      // original input and so on for each successive preceding
+      // 64 byte block as s is updated. This explains the slightly
+      // counter-intuitive looking pattern of register usage in the
+      // stp instructions for backwards copy.
+      //
+      // t0 at offset -16, t1 at offset -8
+      // t2 at offset -32, t3 at offset -24
+      // t4 at offset -48, t5 at offset -40
+      // t6 at offset -64, t7 at offset -56
+
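+      // since the source is even word aligned (see above) the ldp
+      // accesses below, and those in the copy loop, are all 16 byte
+      // aligned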
+      __ ldp(t0, t1, Address(s, 2 * unit));
+      __ ldp(t2, t3, Address(s, 4 * unit));
+      __ ldp(t4, t5, Address(s, 6 * unit));
+      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
+
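+      // t0..t7 now hold a full 64 byte block. each iteration of the
+      // copy loop below stores that block and loads the next one, so
+      // we need 16 outstanding words (the 8 just loaded plus the 8
+      // the loop will load) to enter it and 8 more for each further
+      // trip round; the last block left in registers is written out
+      // by the drain code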
+      __ subs(count, count, 16);
+      __ br(Assembler::LO, drain);
+
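+      // set up the prefetch distance. for a backward copy the
+      // distance is negative and prfm can only encode a small
+      // negative immediate offset, so for larger distances we keep
+      // the distance in the stride register and use the register
+      // offset form of the address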
+      int prefetch = PrefetchCopyIntervalInBytes;
+      bool use_stride = false;
+      if (direction == copy_backwards) {
+         use_stride = prefetch > 256;
+         prefetch = -prefetch;
+         if (use_stride) __ mov(stride, prefetch);
+      }
+
+      __ bind(again);
+
+      if (PrefetchCopyIntervalInBytes > 0)
+        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
+
+      if (direction == copy_forwards) {
+       // allowing for the offset of -8 the store instructions place
+       // registers into the target 64 byte block at the following
+       // offsets
+       //
+       // t0 at offset 0
+       // t1 at offset 8,  t2 at offset 16
+       // t3 at offset 24, t4 at offset 32
+       // t5 at offset 40, t6 at offset 48
+       // t7 at offset 56
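+       //
+       // the stores for the block just loaded are interleaved with
+       // the loads for the next block, and the pre-indexed final str
+       // and ldp advance d and s by a whole 64 byte block per
+       // iteration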
+
+        __ str(t0, Address(d, 1 * unit));
+        __ stp(t1, t2, Address(d, 2 * unit));
+        __ ldp(t0, t1, Address(s, 2 * unit));
+        __ stp(t3, t4, Address(d, 4 * unit));
+        __ ldp(t2, t3, Address(s, 4 * unit));
+        __ stp(t5, t6, Address(d, 6 * unit));
+        __ ldp(t4, t5, Address(s, 6 * unit));
+        __ str(t7, Address(__ pre(d, 8 * unit)));
+        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
+      } else {
+       // d was not offset when we started so the registers are
+       // written into the 64 byte block preceding d with the following
+       // offsets
+       //
+       // t1 at offset -8
+       // t3 at offset -24, t0 at offset -16
+       // t5 at offset -40, t2 at offset -32
+       // t7 at offset -56, t4 at offset -48
+       //                   t6 at offset -64
+       //
+       // note that this matches the offsets previously noted for the
+       // loads
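+       //
+       // as in the forwards case the single word str at either end
+       // leaves every stp destination 16 byte aligned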
+
+        __ str(t1, Address(d, 1 * unit));
+        __ stp(t3, t0, Address(d, 3 * unit));
+        __ ldp(t0, t1, Address(s, 2 * unit));
+        __ stp(t5, t2, Address(d, 5 * unit));
+        __ ldp(t2, t3, Address(s, 4 * unit));
+        __ stp(t7, t4, Address(d, 7 * unit));
+        __ ldp(t4, t5, Address(s, 6 * unit));
+        __ str(t6, Address(__ pre(d, 8 * unit)));
+        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
+      }
+
+      __ subs(count, count, 8);
+      __ br(Assembler::HS, again);
+
+      // Drain
+      //
+      // this uses the same pattern of offsets and register arguments
+      // as above
+      __ bind(drain);
+      if (direction == copy_forwards) {
+        __ str(t0, Address(d, 1 * unit));
+        __ stp(t1, t2, Address(d, 2 * unit));
+        __ stp(t3, t4, Address(d, 4 * unit));
+        __ stp(t5, t6, Address(d, 6 * unit));
+        __ str(t7, Address(__ pre(d, 8 * unit)));
+      } else {
+        __ str(t1, Address(d, 1 * unit));
+        __ stp(t3, t0, Address(d, 3 * unit));
+        __ stp(t5, t2, Address(d, 5 * unit));
+        __ stp(t7, t4, Address(d, 7 * unit));
+        __ str(t6, Address(__ pre(d, 8 * unit)));
+      }
+      // now we need to copy any remaining partial block, which may
+      // include a 4 word subblock and/or a 2 word subblock.
+      // Bits 2 and 1 in the count are the tell-tale for whether we
+      // have each such subblock
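+      // (for example, with 6 words left bits 2 and 1 are both set and
+      // we copy a 4 word subblock followed by a 2 word subblock)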
+      {
+        Label L1, L2;
+        __ tbz(count, exact_log2(4), L1);
+       // this is the same as above but copying only 4 longs hence
+       // with only one intervening stp between the str instructions
+       // but note that the offsets and registers still follow the
+       // same pattern
+        __ ldp(t0, t1, Address(s, 2 * unit));
+        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
+        if (direction == copy_forwards) {
+          __ str(t0, Address(d, 1 * unit));
+          __ stp(t1, t2, Address(d, 2 * unit));
+          __ str(t3, Address(__ pre(d, 4 * unit)));
+        } else {
+          __ str(t1, Address(d, 1 * unit));
+          __ stp(t3, t0, Address(d, 3 * unit));
+          __ str(t2, Address(__ pre(d, 4 * unit)));
+        }
+        __ bind(L1);
+
+        __ tbz(count, 1, L2);
+       // this is the same as above but copying only 2 longs hence
+       // there is no intervening stp between the str instructions
+       // but note that the offset and register patterns are still
+       // the same
+        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
+        if (direction == copy_forwards) {
+          __ str(t0, Address(d, 1 * unit));
+          __ str(t1, Address(__ pre(d, 2 * unit)));
+        } else {
+          __ str(t1, Address(d, 1 * unit));
+          __ str(t0, Address(__ pre(d, 2 * unit)));
+        }
+        __ bind(L2);
+
+       // for forwards copy we need to re-adjust the offsets we
+       // applied so that s and d follow the last words written
+
+       if (direction == copy_forwards) {
+         __ add(s, s, 16);
+         __ add(d, d, 8);
+       }
+
+      }
+
+      __ ret(lr);
+    }
   }
 
   // Small copy: less than 16 bytes.
   //
   // NB: Ignores all of the bits of count which represent more than 15

@@ -1025,15 +1223,13 @@
 
     // 65..80/96 bytes
     // (96 bytes if SIMD because we do 32 bytes per instruction)
     __ bind(copy80);
     if (UseSIMDForMemoryOps) {
-      __ ldpq(v0, v1, Address(s, 0));
-      __ ldpq(v2, v3, Address(s, 32));
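+      // ld4/st4 move all 64 bytes with a single instruction each; the
+      // de-interleave performed by ld4 is exactly undone by st4 so
+      // the data is copied unchanged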
+      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
       __ ldpq(v4, v5, Address(send, -32));
-      __ stpq(v0, v1, Address(d, 0));
-      __ stpq(v2, v3, Address(d, 32));
+      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
       __ stpq(v4, v5, Address(dend, -32));
     } else {
       __ ldp(t0, t1, Address(s, 0));
       __ ldp(t2, t3, Address(s, 16));
       __ ldp(t4, t5, Address(s, 32));