
src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp

rev 48751 : 8196064: AArch64: Merging ld/st into ldp/stp in macro-assembler
Reviewed-by: duke


1777   //
1778   // result: either
1779   //         quotient  (= ra idiv rb)
1780   //         remainder (= ra irem rb)
1781 
1782   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1783 
1784   int idivq_offset = offset();
1785   if (! want_remainder) {
1786     sdiv(result, ra, rb);
1787   } else {
1788     sdiv(scratch, ra, rb);
1789     Assembler::msub(result, scratch, rb, ra);
1790   }
1791 
1792   return idivq_offset;
1793 }
1794 
1795 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1796   address prev = pc() - NativeMembar::instruction_size;
1797   if (prev == code()->last_membar()) {
1798     NativeMembar *bar = NativeMembar_at(prev);
1799     // We are merging two memory barrier instructions.  On AArch64 we
1800     // can do this simply by ORing them together.
1801     bar->set_kind(bar->get_kind() | order_constraint);
1802     BLOCK_COMMENT("merged membar");
1803   } else {
1804     code()->set_last_membar(pc());
1805     dmb(Assembler::barrier(order_constraint));
1806   }
1807 }
1808 
1809 // MacroAssembler routines found actually to be needed
1810 
1811 void MacroAssembler::push(Register src)
1812 {
1813   str(src, Address(pre(esp, -1 * wordSize)));
1814 }
1815 
1816 void MacroAssembler::pop(Register dst)
1817 {
1818   ldr(dst, Address(post(esp, 1 * wordSize)));
1819 }
1820 
1821 // Note: load_unsigned_short used to be called load_unsigned_word.
1822 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1823   int off = offset();
1824   ldrh(dst, src);
1825   return off;
1826 }
1827 
1828 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {


2554   assert(offset >= 0, "spill to negative address?");
2555   // Offset reachable?
2556   //   Not aligned - 9 bits signed offset
2557   //   Aligned - 12 bits unsigned offset shifted
2558   Register base = sp;
2559   if ((offset & (size-1)) && offset >= (1<<8)) {
2560     add(tmp, base, offset & ((1<<12)-1));
2561     base = tmp;
2562     offset &= -1<<12;
2563   }
2564 
2565   if (offset >= (1<<12) * size) {
2566     add(tmp, base, offset & (((1<<12)-1)<<12));
2567     base = tmp;
2568     offset &= ~(((1<<12)-1)<<12);
2569   }
2570 
2571   return Address(base, offset);
2572 }
2573 
2574 /**
2575  * Multiply 64 bit by 64 bit first loop.
2576  */
2577 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2578                                            Register y, Register y_idx, Register z,
2579                                            Register carry, Register product,
2580                                            Register idx, Register kdx) {
2581   //
2582   //  jlong carry, x[], y[], z[];
2583   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2584   //    huge_128 product = y[idx] * x[xstart] + carry;
2585   //    z[kdx] = (jlong)product;
2586   //    carry  = (jlong)(product >>> 64);
2587   //  }
2588   //  z[xstart] = carry;
2589   //
2590 
2591   Label L_first_loop, L_first_loop_exit;
2592   Label L_one_x, L_one_y, L_multiply;
2593 


4255 //        case 1:
4256 //          p[-1] = 0;
4257 //        case 0:
4258 //          p += 8;
4259 //      } while (cnt);
4260 //    }
4261 
4262   const int unroll = 8; // Number of str(zr) instructions we'll unroll
4263 
4264   lsr(len, len, LogBytesPerWord);
4265   andr(rscratch1, len, unroll - 1);  // rscratch1 = cnt % unroll
4266   sub(len, len, rscratch1);      // cnt -= cnt % unroll
4267   // t1 always points to the end of the region we're about to zero
4268   add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
4269   adr(rscratch2, entry);
4270   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
4271   br(rscratch2);
4272   bind(loop);
4273   sub(len, len, unroll);
4274   for (int i = -unroll; i < 0; i++)
4275     str(zr, Address(t1, i * wordSize));
4276   bind(entry);
4277   add(t1, t1, unroll * wordSize);
4278   cbnz(len, loop);
4279 }
4280 
4281 // Defines obj, preserves var_size_in_bytes
4282 void MacroAssembler::eden_allocate(Register obj,
4283                                    Register var_size_in_bytes,
4284                                    int con_size_in_bytes,
4285                                    Register t1,
4286                                    Label& slow_case) {
4287   assert_different_registers(obj, var_size_in_bytes, t1);
4288   if (!Universe::heap()->supports_inline_contig_alloc()) {
4289     b(slow_case);
4290   } else {
4291     Register end = t1;
4292     Register heap_end = rscratch2;
4293     Label retry;
4294     bind(retry);
4295     {




1777   //
1778   // result: either
1779   //         quotient  (= ra idiv rb)
1780   //         remainder (= ra irem rb)
1781 
1782   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1783 
1784   int idivq_offset = offset();
1785   if (! want_remainder) {
1786     sdiv(result, ra, rb);
1787   } else {
1788     sdiv(scratch, ra, rb);
1789     Assembler::msub(result, scratch, rb, ra);
1790   }
1791 
1792   return idivq_offset;
1793 }
1794 
1795 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1796   address prev = pc() - NativeMembar::instruction_size;
1797   address last = code()->last_insn();
1798   if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
1799     NativeMembar *bar = NativeMembar_at(prev);
1800     // We are merging two memory barrier instructions.  On AArch64 we
1801     // can do this simply by ORing them together.
1802     bar->set_kind(bar->get_kind() | order_constraint);
1803     BLOCK_COMMENT("merged membar");
1804   } else {
1805     code()->set_last_insn(pc());
1806     dmb(Assembler::barrier(order_constraint));
1807   }
1808 }
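
Two barriers emitted back to back thus collapse into a single dmb whose kind is the union of both. A minimal sketch of the effect (the barrier sequence is illustrative, not taken from this change):

    __ membar(Assembler::LoadLoad);   // emits dmb, records its pc via set_last_insn()
    __ membar(Assembler::LoadStore);  // pc() - instruction_size == last_insn, so the
                                      // previous dmb's kind is patched to
                                      // LoadLoad | LoadStore; no second dmb is emitted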
1809 
1810 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
1811   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
1812     merge_ldst(rt, adr, size_in_bytes, is_store);
1813     code()->clear_last_insn();
1814     return true;
1815   } else {
1816     assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8-byte or 4-byte loads/stores are supported.");
1817     const unsigned mask = size_in_bytes - 1;
1818     if (adr.getMode() == Address::base_plus_offset &&
1819         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
1820       code()->set_last_insn(pc());
1821     }
1822     return false;
1823   }
1824 }
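
Note the bookkeeping contract here: on a successful merge the last-instruction record is cleared, so a third load/store cannot chain onto the freshly written ldp/stp, and on failure the current instruction is recorded as a merge candidate only when it is a base_plus_offset access with a size-aligned offset, the only form ldst_can_merge accepts. A sketch of how a three-store run plays out under these rules (registers and offsets assumed for illustration):

    __ str(r0, Address(sp, 0));   // no candidate yet: emitted, recorded via set_last_insn()
    __ str(r1, Address(sp, 8));   // merges with the previous str into one stp; record cleared
    __ str(r2, Address(sp, 16));  // no candidate again: emitted as a plain str, recorded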
1825 
1826 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1827   // We always try to merge two adjacent loads into one ldp.
1828   if (!try_merge_ldst(Rx, adr, 8, false)) {
1829     Assembler::ldr(Rx, adr);
1830   }
1831 }
1832 
1833 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1834   // We always try to merge two adjacent loads into one ldp.
1835   if (!try_merge_ldst(Rw, adr, 4, false)) {
1836     Assembler::ldrw(Rw, adr);
1837   }
1838 }
1839 
1840 void MacroAssembler::str(Register Rx, const Address &adr) {
1841   // We always try to merge two adjacent stores into one stp.
1842   if (!try_merge_ldst(Rx, adr, 8, true)) {
1843     Assembler::str(Rx, adr);
1844   }
1845 }
1846 
1847 void MacroAssembler::strw(Register Rw, const Address &adr) {
1848   // We always try to merge two adjacent stores into one stp.
1849   if (!try_merge_ldst(Rw, adr, 4, true)) {
1850     Assembler::strw(Rw, adr);
1851   }
1852 }
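
Together these four overrides make the merging transparent to callers: any two adjacent same-size loads or stores off the same base at contiguous, suitably aligned offsets come out as one pair instruction. For example (an assumed sequence; this relies on AvoidUnalignedAccesses being off, since the base below is not sp):

    __ ldr(r0, Address(r10, 16));
    __ ldr(r1, Address(r10, 24));
    // emitted binary: ldp x0, x1, [x10, #16] rather than two ldrs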
1853 
1854 // MacroAssembler routines found actually to be needed
1855 
1856 void MacroAssembler::push(Register src)
1857 {
1858   str(src, Address(pre(esp, -1 * wordSize)));
1859 }
1860 
1861 void MacroAssembler::pop(Register dst)
1862 {
1863   ldr(dst, Address(post(esp, 1 * wordSize)));
1864 }
1865 
1866 // Note: load_unsigned_short used to be called load_unsigned_word.
1867 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1868   int off = offset();
1869   ldrh(dst, src);
1870   return off;
1871 }
1872 
1873 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {


2599   assert(offset >= 0, "spill to negative address?");
2600   // Offset reachable?
2601   //   Not aligned - 9 bits signed offset
2602   //   Aligned - 12 bits unsigned offset shifted
2603   Register base = sp;
2604   if ((offset & (size-1)) && offset >= (1<<8)) {
2605     add(tmp, base, offset & ((1<<12)-1));
2606     base = tmp;
2607     offset &= -1<<12;
2608   }
2609 
2610   if (offset >= (1<<12) * size) {
2611     add(tmp, base, offset & (((1<<12)-1)<<12));
2612     base = tmp;
2613     offset &= ~(((1<<12)-1)<<12);
2614   }
2615 
2616   return Address(base, offset);
2617 }
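
A worked example of the splitting (values chosen for illustration): spilling an 8-byte register at byte offset 0x1004 fails the alignment test (0x1004 & 7 != 0) and is outside the 9-bit signed range, so the first branch peels the low 12 bits into tmp and the remaining offset encodes in the scaled unsigned 12-bit form:

    // spill_address(8, 0x1004, tmp) conceptually becomes:
    //   add tmp, sp, #0x004          // materialize the unaligned low bits
    //   -> Address(tmp, 0x1000)      // 0x1000 / 8 = 512 fits the scaled u12 field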
2618 
2619 // Checks whether the current and previous load/store offsets are aligned
2620 // well enough to merge into an ldp/stp. Returns true if they are, else false.
2621 bool MacroAssembler::merge_alignment_check(Register base,
2622                                            size_t size,
2623                                            long cur_offset,
2624                                            long prev_offset) const {
2625   if (AvoidUnalignedAccesses) {
2626     if (base == sp) {
2627       // Check whether the low offset is aligned to the size of a register pair.
2628       long pair_mask = size * 2 - 1;
2629       long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2630       return (offset & pair_mask) == 0;
2631     } else { // If base is not sp, we can't guarantee the access is aligned.
2632       return false;
2633     }
2634   } else {
2635     long mask = size - 1;
2636     // Load/store pair instruction only supports element size aligned offset.
2637     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2638   }
2639 }
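
Concretely, with AvoidUnalignedAccesses set, an sp-based pair must start at a 2*size boundary, since that is what the combined ldp/stp access touches (offsets below are illustrative):

    // size = 8, base = sp, pair_mask = 15:
    //   prev_offset = 16, cur_offset = 24  -> low = 16, 16 & 15 == 0 -> mergeable
    //   prev_offset =  8, cur_offset = 16  -> low =  8,  8 & 15 == 8 -> rejected,
    //      even though each individual 8-byte access is itself aligned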
2640 
2641 // Checks whether current and previous loads/stores can be merged.
2642 // Returns true if it can be merged, else false.
2643 bool MacroAssembler::ldst_can_merge(Register rt,
2644                                     const Address &adr,
2645                                     size_t cur_size_in_bytes,
2646                                     bool is_store) const {
2647   address prev = pc() - NativeInstruction::instruction_size;
2648   address last = code()->last_insn();
2649 
2650   if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
2651     return false;
2652   }
2653 
2654   if (adr.getMode() != Address::base_plus_offset || prev != last) {
2655     return false;
2656   }
2657 
2658   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2659   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
2660 
2661   assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32-bit merging.");
2662   assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32-bit merging.");
2663 
2664   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
2665     return false;
2666   }
2667 
2668   long max_offset = 63 * prev_size_in_bytes;
2669   long min_offset = -64 * prev_size_in_bytes;
2670 
2671   assert(prev_ldst->is_not_pre_post_index(), "merging pre-index or post-index accesses is not supported.");
2672 
2673   // Only accesses with the same base register can be merged.
2674   if (adr.base() != prev_ldst->base()) {
2675     return false;
2676   }
2677 
2678   long cur_offset = adr.offset();
2679   long prev_offset = prev_ldst->offset();
2680   size_t diff = abs(cur_offset - prev_offset);
2681   if (diff != prev_size_in_bytes) {
2682     return false;
2683   }
2684 
2685   // The following cases cannot be merged:
2686   // ldr x2, [x2, #8]
2687   // ldr x3, [x2, #16]
2688   // or:
2689   // ldr x2, [x3, #8]
2690   // ldr x2, [x3, #16]
2691   // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
2692   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
2693     return false;
2694   }
2695 
2696   long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2697   // The low offset must lie within the ldp/stp immediate range.
2698   if (low_offset > max_offset || low_offset < min_offset) {
2699     return false;
2700   }
2701 
2702   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
2703     return true;
2704   }
2705 
2706   return false;
2707 }
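
The adjacency test is strict: the two offsets must differ by exactly one element, and the lower one must sit inside the pair instruction's scaled 7-bit signed immediate range (-64*size .. 63*size). For example (assumed sequences):

    // diff == element size -> candidate:     ldr x0, [x10, #16]; ldr x1, [x10, #24]
    // diff == 2 * element size -> no merge:  ldr x0, [x10, #16]; ldr x1, [x10, #32]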
2708 
2709 // Merge current load/store with previous load/store into ldp/stp.
2710 void MacroAssembler::merge_ldst(Register rt,
2711                                 const Address &adr,
2712                                 size_t cur_size_in_bytes,
2713                                 bool is_store) {
2714 
2715   assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store), "cur and prev must be able to be merged.");
2716 
2717   Register rt_low, rt_high;
2718   address prev = pc() - NativeInstruction::instruction_size;
2719   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2720 
2721   long offset;
2722 
2723   if (adr.offset() < prev_ldst->offset()) {
2724     offset = adr.offset();
2725     rt_low = rt;
2726     rt_high = prev_ldst->target();
2727   } else {
2728     offset = prev_ldst->offset();
2729     rt_low = prev_ldst->target();
2730     rt_high = rt;
2731   }
2732 
2733   Address adr_p = Address(prev_ldst->base(), offset);
2734   // Overwrite the previously generated instruction.
2735   code_section()->set_end(prev);
2736 
2737   const int sz = prev_ldst->size_in_bytes();
2738   assert(sz == 8 || sz == 4, "only supports 64/32-bit merging.");
2739   if (!is_store) {
2740     BLOCK_COMMENT("merged ldr pair");
2741     if (sz == 8) {
2742       ldp(rt_low, rt_high, adr_p);
2743     } else {
2744       ldpw(rt_low, rt_high, adr_p);
2745     }
2746   } else {
2747     BLOCK_COMMENT("merged str pair");
2748     if (sz == 8) {
2749       stp(rt_low, rt_high, adr_p);
2750     } else {
2751       stpw(rt_low, rt_high, adr_p);
2752     }
2753   }
2754 }
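
The rewrite itself works by rewinding the buffer: set_end(prev) truncates the code section to just before the previous load/store, and the pair instruction is emitted over it, with the register operands ordered to match memory order regardless of the order in which the two accesses were generated. For a descending pair (an assumed example):

    __ str(r1, Address(sp, 24));
    __ str(r0, Address(sp, 16));
    // emitted: stp x0, x1, [sp, #16]  -- operands swapped to follow the offsets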
2755 
2756 /**
2757  * Multiply 64 bit by 64 bit first loop.
2758  */
2759 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2760                                            Register y, Register y_idx, Register z,
2761                                            Register carry, Register product,
2762                                            Register idx, Register kdx) {
2763   //
2764   //  jlong carry, x[], y[], z[];
2765   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2766   //    huge_128 product = y[idx] * x[xstart] + carry;
2767   //    z[kdx] = (jlong)product;
2768   //    carry  = (jlong)(product >>> 64);
2769   //  }
2770   //  z[xstart] = carry;
2771   //
2772 
2773   Label L_first_loop, L_first_loop_exit;
2774   Label L_one_x, L_one_y, L_multiply;
2775 


4437 //        case 1:
4438 //          p[-1] = 0;
4439 //        case 0:
4440 //          p += 8;
4441 //      } while (cnt);
4442 //    }
4443 
4444   const int unroll = 8; // Number of str(zr) instructions we'll unroll
4445 
4446   lsr(len, len, LogBytesPerWord);
4447   andr(rscratch1, len, unroll - 1);  // rscratch1 = cnt % unroll
4448   sub(len, len, rscratch1);      // cnt -= cnt % unroll
4449   // t1 always points to the end of the region we're about to zero
4450   add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
4451   adr(rscratch2, entry);
4452   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
4453   br(rscratch2);
4454   bind(loop);
4455   sub(len, len, unroll);
4456   for (int i = -unroll; i < 0; i++)
4457     Assembler::str(zr, Address(t1, i * wordSize));
4458   bind(entry);
4459   add(t1, t1, unroll * wordSize);
4460   cbnz(len, loop);
4461 }
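
The adr/sub/br triple implements a Duff's-device entry into the unrolled block: it backs up from entry by one 4-byte instruction per remaining word, so exactly cnt % unroll tail stores run on the first pass. That is also why this hunk's one functional change switches the body to Assembler::str: the merging str wrapper above could fuse two neighbouring zeroing stores into one stp, which would change the instruction count and break the computed entry points.

    // rscratch1 = cnt % unroll; every str(zr, ...) occupies exactly 4 bytes
    //   adr  rscratch2, entry
    //   sub  rscratch2, rscratch2, rscratch1, lsl #2   // back up rscratch1 instructions
    //   br   rscratch2                                 // run the tail stores first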
4462 
4463 // Defines obj, preserves var_size_in_bytes
4464 void MacroAssembler::eden_allocate(Register obj,
4465                                    Register var_size_in_bytes,
4466                                    int con_size_in_bytes,
4467                                    Register t1,
4468                                    Label& slow_case) {
4469   assert_different_registers(obj, var_size_in_bytes, t1);
4470   if (!Universe::heap()->supports_inline_contig_alloc()) {
4471     b(slow_case);
4472   } else {
4473     Register end = t1;
4474     Register heap_end = rscratch2;
4475     Label retry;
4476     bind(retry);
4477     {

