  //
  // result: either
  //         quotient  (= ra idiv rb)
  //         remainder (= ra irem rb)

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  int idivq_offset = offset();
  if (! want_remainder) {
    sdiv(result, ra, rb);
  } else {
    sdiv(scratch, ra, rb);
    Assembler::msub(result, scratch, rb, ra);
  }

  return idivq_offset;
}
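
// Illustrative note (not in the original source): msub(result, scratch, rb, ra)
// computes result = ra - scratch * rb, which is exactly the Java irem
// definition once scratch holds the truncated quotient. For example,
// ra = 7, rb = 3 gives scratch = 7 / 3 = 2 and result = 7 - 2 * 3 = 1
// = 7 irem 3; sdiv truncates toward zero, so the sign of a nonzero
// remainder always follows the dividend, as Java requires.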

void MacroAssembler::membar(Membar_mask_bits order_constraint) {
  address prev = pc() - NativeMembar::instruction_size;
  address last = code()->last_insn();
  if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
    NativeMembar *bar = NativeMembar_at(prev);
    // We are merging two memory barrier instructions.  On AArch64 we
    // can do this simply by ORing them together.
    bar->set_kind(bar->get_kind() | order_constraint);
    BLOCK_COMMENT("merged membar");
  } else {
    code()->set_last_insn(pc());
    dmb(Assembler::barrier(order_constraint));
  }
}
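
// Example of the effect (illustrative, not from the original source):
//   membar(LoadLoad);    // emits a dmb and records its pc
//   membar(StoreStore);  // emits nothing; rewrites the previous barrier's
//                        // kind to LoadLoad|StoreStore
// Back-to-back barriers therefore collapse into a single dmb whose kind is
// the union of the requested orderings.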

bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
  if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
    merge_ldst(rt, adr, size_in_bytes, is_store);
    code()->clear_last_insn();
    return true;
  } else {
    assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8-byte or 4-byte loads/stores are supported.");
    const unsigned mask = size_in_bytes - 1;
    if (adr.getMode() == Address::base_plus_offset &&
        (adr.offset() & mask) == 0) {  // only base_plus_offset is supported.
      code()->set_last_insn(pc());
    }
    return false;
  }
}
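
// Bookkeeping note (added for clarity): after a successful merge the
// resulting ldp/stp must not itself become a merge candidate, hence
// clear_last_insn(); otherwise a naturally aligned base_plus_offset access
// records itself via set_last_insn() as a candidate for the next call.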
1825
1826 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1827 // We always try to merge two adjacent loads into one ldp.
1828 if (!try_merge_ldst(Rx, adr, 8, false)) {
1829 Assembler::ldr(Rx, adr);
1830 }
1831 }
1832
1833 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1834 // We always try to merge two adjacent loads into one ldp.
1835 if (!try_merge_ldst(Rw, adr, 4, false)) {
1836 Assembler::ldrw(Rw, adr);
1837 }
1838 }
1839
1840 void MacroAssembler::str(Register Rx, const Address &adr) {
1841 // We always try to merge two adjacent stores into one stp.
1842 if (!try_merge_ldst(Rx, adr, 8, true)) {
1843 Assembler::str(Rx, adr);
1844 }
1845 }
1846
1847 void MacroAssembler::strw(Register Rw, const Address &adr) {
1848 // We always try to merge two adjacent stores into one stp.
1849 if (!try_merge_ldst(Rw, adr, 4, true)) {
1850 Assembler::strw(Rw, adr);
1851 }
1852 }
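
// Usage sketch (illustrative): two adjacent stores through these wrappers,
// e.g.
//   str(r1, Address(sp, 16));
//   str(r2, Address(sp, 24));
// emit a single "stp r1, r2, [sp, #16]"; the second call rewinds the code
// buffer over the first str and replaces it with the pair instruction.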

// MacroAssembler routines found actually to be needed

void MacroAssembler::push(Register src)
{
  str(src, Address(pre(esp, -1 * wordSize)));
}

void MacroAssembler::pop(Register dst)
{
  ldr(dst, Address(post(esp, 1 * wordSize)));
}
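
// Note (added for clarity): push uses a pre-indexed store, "str src,
// [esp, #-8]!", and pop a post-indexed load, "ldr dst, [esp], #8", so a
// pop exactly undoes the esp adjustment of the preceding push.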

// Note: load_unsigned_short used to be called load_unsigned_word.
int MacroAssembler::load_unsigned_short(Register dst, Address src) {
  int off = offset();
  ldrh(dst, src);
  return off;
}

int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
  int off = offset();
  ldrb(dst, src);
  return off;
}

// ...

Address MacroAssembler::spill_address(int size, int offset, Register tmp)
{
  assert(offset >= 0, "spill to negative address?");
  // Offset reachable ?
  //   Not aligned - 9 bits signed offset
  //   Aligned - 12 bits unsigned offset shifted
  Register base = sp;
  if ((offset & (size-1)) && offset >= (1<<8)) {
    add(tmp, base, offset & ((1<<12)-1));
    base = tmp;
    offset &= -1<<12;
  }

  if (offset >= (1<<12) * size) {
    add(tmp, base, offset & (((1<<12)-1)<<12));
    base = tmp;
    offset &= ~(((1<<12)-1)<<12);
  }

  return Address(base, offset);
}
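
// Worked example (illustrative): size = 8, offset = 0x105 (unaligned,
// >= 1<<8) takes the first branch: tmp = sp + 0x105 and the returned
// address is [tmp, #0], reachable by the unscaled 9-bit signed form.
// An aligned offset = 0x12008 takes the second branch instead:
// tmp = sp + 0x12000, leaving [tmp, #8], which fits the scaled 12-bit
// unsigned form.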

// Checks whether the offset is aligned.
// Returns true if it is, else false.
bool MacroAssembler::merge_alignment_check(Register base,
                                           size_t size,
                                           long cur_offset,
                                           long prev_offset) const {
  if (AvoidUnalignedAccesses) {
    if (base == sp) {
      // Checks whether the low offset is aligned to a pair of registers.
      long pair_mask = size * 2 - 1;
      long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
      return (offset & pair_mask) == 0;
    } else { // If base is not sp, we can't guarantee the access is aligned.
      return false;
    }
  } else {
    long mask = size - 1;
    // The load/store pair instructions only support element-size-aligned offsets.
    return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
  }
}
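
// Example (illustrative): for 8-byte accesses off sp with
// AvoidUnalignedAccesses set, pair_mask = 15, so offsets {16, 24} may merge
// (16 & 15 == 0) while {8, 16} may not (8 & 15 != 0), keeping the resulting
// ldp/stp 16-byte aligned.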

// Checks whether the current and previous loads/stores can be merged.
// Returns true if they can, else false.
bool MacroAssembler::ldst_can_merge(Register rt,
                                    const Address &adr,
                                    size_t cur_size_in_bytes,
                                    bool is_store) const {
  address prev = pc() - NativeInstruction::instruction_size;
  address last = code()->last_insn();

  if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
    return false;
  }

  if (adr.getMode() != Address::base_plus_offset || prev != last) {
    return false;
  }

  NativeLdSt* prev_ldst = NativeLdSt_at(prev);
  size_t prev_size_in_bytes = prev_ldst->size_in_bytes();

  assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
  assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");

  if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
    return false;
  }

  long max_offset = 63 * prev_size_in_bytes;
  long min_offset = -64 * prev_size_in_bytes;

  assert(prev_ldst->is_not_pre_post_index(), "merging pre-index or post-index accesses is not supported.");

  // Only accesses with the same base register can be merged.
  if (adr.base() != prev_ldst->base()) {
    return false;
  }

  long cur_offset = adr.offset();
  long prev_offset = prev_ldst->offset();
  size_t diff = abs(cur_offset - prev_offset);
  if (diff != prev_size_in_bytes) {
    return false;
  }

  // The following cases cannot be merged:
  //   ldr x2, [x2, #8]
  //   ldr x3, [x2, #16]
  // or:
  //   ldr x2, [x3, #8]
  //   ldr x2, [x3, #16]
  // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get a SIGILL.
  if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
    return false;
  }

  long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
  // The low offset must be within the ldp/stp immediate range.
  if (low_offset > max_offset || low_offset < min_offset) {
    return false;
  }

  if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
    return true;
  }

  return false;
}
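
// Range note (added for clarity): max_offset/min_offset above correspond to
// the ldp/stp scaled 7-bit signed immediate; for 8-byte elements that is
// [-64*8, 63*8] = [-512, 504], checked against the lower of the two offsets.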

// Merges the current load/store with the previous load/store into an ldp/stp.
void MacroAssembler::merge_ldst(Register rt,
                                const Address &adr,
                                size_t cur_size_in_bytes,
                                bool is_store) {

  assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store), "cur and prev must be able to be merged.");

  Register rt_low, rt_high;
  address prev = pc() - NativeInstruction::instruction_size;
  NativeLdSt* prev_ldst = NativeLdSt_at(prev);

  long offset;

  if (adr.offset() < prev_ldst->offset()) {
    offset = adr.offset();
    rt_low = rt;
    rt_high = prev_ldst->target();
  } else {
    offset = prev_ldst->offset();
    rt_low = prev_ldst->target();
    rt_high = rt;
  }

  Address adr_p = Address(prev_ldst->base(), offset);
  // Overwrite the previously generated binary.
  code_section()->set_end(prev);

  const int sz = prev_ldst->size_in_bytes();
  assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
  if (!is_store) {
    BLOCK_COMMENT("merged ldr pair");
    if (sz == 8) {
      ldp(rt_low, rt_high, adr_p);
    } else {
      ldpw(rt_low, rt_high, adr_p);
    }
  } else {
    BLOCK_COMMENT("merged str pair");
    if (sz == 8) {
      stp(rt_low, rt_high, adr_p);
    } else {
      stpw(rt_low, rt_high, adr_p);
    }
  }
}
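
// Illustrative effect (sketch): with
//   ldr x1, [sp, #16]
// already emitted and ldr(x2, Address(sp, 24)) requested, set_end(prev)
// rewinds the code buffer over the first ldr and a single
//   ldp x1, x2, [sp, #16]
// is emitted in its place.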

/**
 * Multiply 64 bit by 64 bit first loop.
 */
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  // ...

  //      case 1:
  //        p[-1] = 0;
  //      case 0:
  //        p += 8;
  //    } while (cnt);
  //  }

  const int unroll = 8; // Number of str(zr) instructions we'll unroll

  lsr(len, len, LogBytesPerWord);
  andr(rscratch1, len, unroll - 1);  // rscratch1 = cnt % unroll
  sub(len, len, rscratch1);          // cnt -= cnt % unroll
  // t1 always points to the end of the region we're about to zero
  add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
  br(rscratch2);
  bind(loop);
  sub(len, len, unroll);
  for (int i = -unroll; i < 0; i++)
    // Use Assembler::str directly so these stores don't themselves become
    // ldp/stp merge candidates.
    Assembler::str(zr, Address(t1, i * wordSize));
  bind(entry);
  add(t1, t1, unroll * wordSize);
  cbnz(len, loop);
}
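
// Control-flow sketch (illustrative, assuming each str encodes to 4 bytes):
// with cnt % 8 == 3, rscratch2 = entry - 3 * 4, so the indirect br lands on
// the last three str(zr) instructions and zeroes the 3 leftover words before
// the unrolled loop handles the rest -- a classic Duff's device entry.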

// Defines obj, preserves var_size_in_bytes
void MacroAssembler::eden_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Label& slow_case) {
  assert_different_registers(obj, var_size_in_bytes, t1);
  if (!Universe::heap()->supports_inline_contig_alloc()) {
    b(slow_case);
  } else {
    Register end = t1;
    Register heap_end = rscratch2;
    Label retry;
    bind(retry);
    {