
src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp

rev 48555 : 8196064: AArch64: Merging ld/st into ldp/stp in macro-assembler
Reviewed-by: duke

*** 1803,1812 ****
--- 1803,1856 ----
        code()->set_last_membar(pc());
        dmb(Assembler::barrier(order_constraint));
      }
    }
  
+ bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
+   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
+     merge_ldst(rt, adr, size_in_bytes, is_store);
+     code()->clear_last_ldst();
+     return true;
+   } else {
+     assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported.");
+     const unsigned mask = size_in_bytes - 1;
+     if (adr.getMode() == Address::base_plus_offset &&
+         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
+       code()->set_last_ldst(pc());
+     }
+     return false;
+   }
+ }
+ 
+ void MacroAssembler::ldr(Register Rx, const Address &adr) {
+   // We always try to merge two adjacent loads into one ldp.
+   if (!try_merge_ldst(Rx, adr, 8, false)) {
+     Assembler::ldr(Rx, adr);
+   }
+ }
+ 
+ void MacroAssembler::ldrw(Register Rw, const Address &adr) {
+   // We always try to merge two adjacent loads into one ldp.
+   if (!try_merge_ldst(Rw, adr, 4, false)) {
+     Assembler::ldrw(Rw, adr);
+   }
+ }
+ 
+ void MacroAssembler::str(Register Rx, const Address &adr) {
+   // We always try to merge two adjacent stores into one stp.
+   if (!try_merge_ldst(Rx, adr, 8, true)) {
+     Assembler::str(Rx, adr);
+   }
+ }
+ 
+ void MacroAssembler::strw(Register Rw, const Address &adr) {
+   // We always try to merge two adjacent stores into one stp.
+   if (!try_merge_ldst(Rw, adr, 4, true)) {
+     Assembler::strw(Rw, adr);
+   }
+ }
+ 
  // MacroAssembler routines found actually to be needed
  
  void MacroAssembler::push(Register src) {
    str(src, Address(pre(esp, -1 * wordSize)));
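Note: from a call site's point of view, the new wrappers make the merging transparent. A minimal sketch of the intended behavior (hypothetical stub code, assuming HotSpot's usual "#define __ masm->" convention; the registers and offsets are illustrative, not taken from this change):

    __ str(r0, Address(sp, 0));  // no merge candidate yet: try_merge_ldst()
                                 // records this instruction via set_last_ldst(pc())
    __ str(r1, Address(sp, 8));  // adjacent, same size, same base: the previous
                                 // str is rewound and the pair is re-emitted as
                                 //   stp x0, x1, [sp]

Call sites that must not be fused (for example, where each instruction's exact size matters) can bypass the wrappers by calling Assembler::str directly, as the zero_words change further down does.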
*** 2568,2577 ****
--- 2612,2752 ----
    }
    return Address(base, offset);
  }
  
+ // Checks whether the offset is aligned.
+ // Returns true if it is, else false.
+ bool MacroAssembler::merge_alignment_check(Register base,
+                                            size_t size,
+                                            long cur_offset,
+                                            long prev_offset) const {
+   if (AvoidUnalignedAccesses) {
+     if (base == sp) {
+       // Checks whether the low offset is aligned to a pair of registers.
+       long pair_mask = size * 2 - 1;
+       long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
+       return (offset & pair_mask) == 0;
+     } else { // If base is not sp, we can't guarantee the access is aligned.
+       return false;
+     }
+   } else {
+     long mask = size - 1;
+     // The load/store pair instructions only support element-size-aligned offsets.
+     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
+   }
+ }
+ 
+ // Checks whether the current and previous loads/stores can be merged.
+ // Returns true if they can be merged, else false.
+ bool MacroAssembler::ldst_can_merge(Register rt,
+                                     const Address &adr,
+                                     size_t cur_size_in_bytes,
+                                     bool is_store) const {
+   address prev = pc() - NativeInstruction::instruction_size;
+   address last = code()->last_ldst();
+ 
+   if (adr.getMode() != Address::base_plus_offset || prev != last) {
+     return false;
+   }
+ 
+   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
+   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
+ 
+   assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
+   assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
+ 
+   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
+     return false;
+   }
+ 
+   long max_offset = 63 * prev_size_in_bytes;
+   long min_offset = -64 * prev_size_in_bytes;
+ 
+   assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged.");
+ 
+   // Only loads/stores with the same base register can be merged.
+   if (adr.base() != prev_ldst->base()) {
+     return false;
+   }
+ 
+   long cur_offset = adr.offset();
+   long prev_offset = prev_ldst->offset();
+   size_t diff = abs(cur_offset - prev_offset);
+   if (diff != prev_size_in_bytes) {
+     return false;
+   }
+ 
+   // The following cases cannot be merged:
+   //   ldr x2, [x2, #8]
+   //   ldr x3, [x2, #16]
+   // or:
+   //   ldr x2, [x3, #8]
+   //   ldr x2, [x3, #16]
+   // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get a SIGILL.
+   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
+     return false;
+   }
+ 
+   long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
+   // The low offset must be within the ldp/stp instruction's immediate range.
+   if (low_offset > max_offset || low_offset < min_offset) {
+     return false;
+   }
+ 
+   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
+     return true;
+   }
+ 
+   return false;
+ }
+ 
+ // Merge the current load/store with the previous load/store into an ldp/stp.
+ void MacroAssembler::merge_ldst(Register rt,
+                                 const Address &adr,
+                                 size_t cur_size_in_bytes,
+                                 bool is_store) {
+ 
+   assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged.");
+ 
+   Register rt_low, rt_high;
+   address prev = pc() - NativeInstruction::instruction_size;
+   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
+ 
+   long offset;
+ 
+   if (adr.offset() < prev_ldst->offset()) {
+     offset = adr.offset();
+     rt_low = rt;
+     rt_high = prev_ldst->target();
+   } else {
+     offset = prev_ldst->offset();
+     rt_low = prev_ldst->target();
+     rt_high = rt;
+   }
+ 
+   Address adr_p = Address(prev_ldst->base(), offset);
+   // Overwrite the previously generated binary.
+   code_section()->set_end(prev);
+ 
+   const int sz = prev_ldst->size_in_bytes();
+   assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
+   if (!is_store) {
+     if (sz == 8) {
+       ldp(rt_low, rt_high, adr_p);
+     } else {
+       ldpw(rt_low, rt_high, adr_p);
+     }
+   } else {
+     if (sz == 8) {
+       stp(rt_low, rt_high, adr_p);
+     } else {
+       stpw(rt_low, rt_high, adr_p);
+     }
+   }
+ }
+ 
  /**
   * Multiply 64 bit by 64 bit first loop.
   */
  void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                             Register y, Register y_idx, Register z,
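For reference, the max_offset/min_offset bounds in ldst_can_merge come from the ldp/stp encoding: a signed 7-bit immediate scaled by the element size, i.e. reachable byte offsets in [-64*size, 63*size]. A standalone sketch of the same adjacency and range check (plain C++, independent of the HotSpot types; the alignment side is handled separately by merge_alignment_check above):

    #include <cstdlib>

    // ldp/stp reach byte offsets in [-64*size, 63*size] (7-bit signed
    // immediate, scaled by the element size).
    bool pair_offsets_ok(long cur, long prev, long size) {
      if (labs(cur - prev) != size) return false;  // accesses must be adjacent
      long low = cur < prev ? cur : prev;          // the pair encodes the low offset
      return low >= -64 * size && low <= 63 * size;
    }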
*** 4269,4279 ****
      sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
      br(rscratch2);
      bind(loop);
      sub(len, len, unroll);
      for (int i = -unroll; i < 0; i++)
!       str(zr, Address(t1, i * wordSize));
      bind(entry);
      add(t1, t1, unroll * wordSize);
      cbnz(len, loop);
  }
--- 4444,4454 ----
      sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
      br(rscratch2);
      bind(loop);
      sub(len, len, unroll);
      for (int i = -unroll; i < 0; i++)
!       Assembler::str(zr, Address(t1, i * wordSize));
      bind(entry);
      add(t1, t1, unroll * wordSize);
      cbnz(len, loop);
  }
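The switch from str to Assembler::str here appears deliberate: br(rscratch2) above computes a jump into the middle of the unrolled store sequence (rscratch1 scaled by 4, the instruction size), so each store must remain exactly one 4-byte instruction; letting the new MacroAssembler::str fuse two of them into an stp would break the computed branch target. A sketch of the distinction (illustrative only, not from this change):

    // May be fused with an adjacent store by the new wrapper:
    str(zr, Address(t1, 0));
    str(zr, Address(t1, 8));              // can become stp zr, zr, [t1]
    // Always emits a single 4-byte instruction, never merged:
    Assembler::str(zr, Address(t1, 16));  // safe in computed-branch regions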