# HG changeset patch # User zyao # Date 1517625418 -28800 # Sat Feb 03 10:36:58 2018 +0800 # Node ID 31a2d4127831ef292a347c012a23301cefaecf0b # Parent 47f19ff9903c2a7b4729e2a4213f53886266e6b9 8196064: AArch64: Merging ld/st into ldp/stp in macro-assembler Reviewed-by: duke diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp --- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp @@ -1794,18 +1794,63 @@ void MacroAssembler::membar(Membar_mask_bits order_constraint) { address prev = pc() - NativeMembar::instruction_size; - if (prev == code()->last_membar()) { + address last = code()->last_insn(); + if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) { NativeMembar *bar = NativeMembar_at(prev); // We are merging two memory barrier instructions. On AArch64 we // can do this simply by ORing them together. bar->set_kind(bar->get_kind() | order_constraint); BLOCK_COMMENT("merged membar"); } else { - code()->set_last_membar(pc()); + code()->set_last_insn(pc()); dmb(Assembler::barrier(order_constraint)); } } +bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) { + if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) { + merge_ldst(rt, adr, size_in_bytes, is_store); + code()->clear_last_insn(); + return true; + } else { + assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported."); + const unsigned mask = size_in_bytes - 1; + if (adr.getMode() == Address::base_plus_offset && + (adr.offset() & mask) == 0) { // only supports base_plus_offset. + code()->set_last_insn(pc()); + } + return false; + } +} + +void MacroAssembler::ldr(Register Rx, const Address &adr) { + // We always try to merge two adjacent loads into one ldp. 
+ if (!try_merge_ldst(Rx, adr, 8, false)) { + Assembler::ldr(Rx, adr); + } +} + +void MacroAssembler::ldrw(Register Rw, const Address &adr) { + // We always try to merge two adjacent loads into one ldp. + if (!try_merge_ldst(Rw, adr, 4, false)) { + Assembler::ldrw(Rw, adr); + } +} + +void MacroAssembler::str(Register Rx, const Address &adr) { + // We always try to merge two adjacent stores into one stp. + if (!try_merge_ldst(Rx, adr, 8, true)) { + Assembler::str(Rx, adr); + } +} + +void MacroAssembler::strw(Register Rw, const Address &adr) { + // We always try to merge two adjacent stores into one stp. + if (!try_merge_ldst(Rw, adr, 4, true)) { + Assembler::strw(Rw, adr); + } +} + // MacroAssembler routines found actually to be needed void MacroAssembler::push(Register src) @@ -2571,6 +2616,143 @@ return Address(base, offset); } +// Checks whether offset is aligned. +// Returns true if it is, else false. +bool MacroAssembler::merge_alignment_check(Register base, + size_t size, + long cur_offset, + long prev_offset) const { + if (AvoidUnalignedAccesses) { + if (base == sp) { + // Checks whether the low offset is aligned to a pair of registers. + long pair_mask = size * 2 - 1; + long offset = prev_offset > cur_offset ? cur_offset : prev_offset; + return (offset & pair_mask) == 0; + } else { // If base is not sp, we can't guarantee the access is aligned. + return false; + } + } else { + long mask = size - 1; + // Load/store pair instruction only supports element size aligned offset. + return (cur_offset & mask) == 0 && (prev_offset & mask) == 0; + } +} + +// Checks whether current and previous loads/stores can be merged. +// Returns true if it can be merged, else false. 
+bool MacroAssembler::ldst_can_merge(Register rt, + const Address &adr, + size_t cur_size_in_bytes, + bool is_store) const { + address prev = pc() - NativeInstruction::instruction_size; + address last = code()->last_insn(); + + if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) { + return false; + } + + if (adr.getMode() != Address::base_plus_offset || prev != last) { + return false; + } + + NativeLdSt* prev_ldst = NativeLdSt_at(prev); + size_t prev_size_in_bytes = prev_ldst->size_in_bytes(); + + assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging."); + assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging."); + + if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) { + return false; + } + + long max_offset = 63 * prev_size_in_bytes; + long min_offset = -64 * prev_size_in_bytes; + + assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged."); + + // Only same base can be merged. + if (adr.base() != prev_ldst->base()) { + return false; + } + + long cur_offset = adr.offset(); + long prev_offset = prev_ldst->offset(); + size_t diff = abs(cur_offset - prev_offset); + if (diff != prev_size_in_bytes) { + return false; + } + + // The following cases cannot be merged: + // ldr x2, [x2, #8] + // ldr x3, [x2, #16] + // or: + // ldr x2, [x3, #8] + // ldr x2, [x3, #16] + // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL. + if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) { + return false; + } + + long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset; + // Offset range must be in ldp/stp instruction's range. 
+ if (low_offset > max_offset || low_offset < min_offset) { + return false; + } + + if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) { + return true; + } + + return false; +} + +// Merge current load/store with previous load/store into ldp/stp. +void MacroAssembler::merge_ldst(Register rt, + const Address &adr, + size_t cur_size_in_bytes, + bool is_store) { + + assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged."); + + Register rt_low, rt_high; + address prev = pc() - NativeInstruction::instruction_size; + NativeLdSt* prev_ldst = NativeLdSt_at(prev); + + long offset; + + if (adr.offset() < prev_ldst->offset()) { + offset = adr.offset(); + rt_low = rt; + rt_high = prev_ldst->target(); + } else { + offset = prev_ldst->offset(); + rt_low = prev_ldst->target(); + rt_high = rt; + } + + Address adr_p = Address(prev_ldst->base(), offset); + // Overwrite previous generated binary. + code_section()->set_end(prev); + + const int sz = prev_ldst->size_in_bytes(); + assert(sz == 8 || sz == 4, "only supports 64/32bit merging."); + if (!is_store) { + BLOCK_COMMENT("merged ldr pair"); + if (sz == 8) { + ldp(rt_low, rt_high, adr_p); + } else { + ldpw(rt_low, rt_high, adr_p); + } + } else { + BLOCK_COMMENT("merged str pair"); + if (sz == 8) { + stp(rt_low, rt_high, adr_p); + } else { + stpw(rt_low, rt_high, adr_p); + } + } +} + /** * Multiply 64 bit by 64 bit first loop. 
*/ @@ -4272,7 +4454,7 @@ bind(loop); sub(len, len, unroll); for (int i = -unroll; i < 0; i++) - str(zr, Address(t1, i * wordSize)); + Assembler::str(zr, Address(t1, i * wordSize)); bind(entry); add(t1, t1, unroll * wordSize); cbnz(len, loop); diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp --- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp @@ -150,11 +150,19 @@ void bind(Label& L) { Assembler::bind(L); - code()->clear_last_membar(); + code()->clear_last_insn(); } void membar(Membar_mask_bits order_constraint); + using Assembler::ldr; + using Assembler::str; + + void ldr(Register Rx, const Address &adr); + void ldrw(Register Rw, const Address &adr); + void str(Register Rx, const Address &adr); + void strw(Register Rx, const Address &adr); + // Frame creation and destruction shared between JITs. void build_frame(int framesize); void remove_frame(int framesize); @@ -1291,6 +1299,17 @@ // Uses rscratch2 if the address is not directly reachable Address spill_address(int size, int offset, Register tmp=rscratch2); + bool merge_alignment_check(Register base, size_t size, long cur_offset, long prev_offset) const; + + // Check whether two loads/stores can be merged into ldp/stp. + bool ldst_can_merge(Register rx, const Address &adr, size_t cur_size_in_bytes, bool is_store) const; + + // Merge current load/store with previous load/store into ldp/stp. + void merge_ldst(Register rx, const Address &adr, size_t cur_size_in_bytes, bool is_store); + + // Try to merge two loads/stores into ldp/stp. If success, returns true else false. 
+ bool try_merge_ldst(Register rt, const Address &adr, size_t cur_size_in_bytes, bool is_store); + public: void spill(Register Rx, bool is64, int offset) { if (is64) { diff --git a/src/hotspot/cpu/aarch64/nativeInst_aarch64.hpp b/src/hotspot/cpu/aarch64/nativeInst_aarch64.hpp --- a/src/hotspot/cpu/aarch64/nativeInst_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/nativeInst_aarch64.hpp @@ -131,6 +131,13 @@ return Instruction_aarch64::extract(insn, 31, 12) == 0b11010101000000110011 && Instruction_aarch64::extract(insn, 7, 0) == 0b10111111; } + + bool is_Imm_LdSt() { + unsigned int insn = uint_at(0); + return Instruction_aarch64::extract(insn, 29, 27) == 0b111 && + Instruction_aarch64::extract(insn, 23, 23) == 0b0 && + Instruction_aarch64::extract(insn, 26, 25) == 0b00; + } }; inline NativeInstruction* nativeInstruction_at(address address) { @@ -532,4 +539,57 @@ return (NativeMembar*)addr; } +class NativeLdSt : public NativeInstruction { +private: + int32_t size() { return Instruction_aarch64::extract(uint_at(0), 31, 30); } + // Check whether instruction is with unscaled offset. + bool is_ldst_ur() { + return (Instruction_aarch64::extract(uint_at(0), 29, 21) == 0b111000010 || + Instruction_aarch64::extract(uint_at(0), 29, 21) == 0b111000000) && + Instruction_aarch64::extract(uint_at(0), 11, 10) == 0b00; + } + bool is_ldst_unsigned_offset() { + return Instruction_aarch64::extract(uint_at(0), 29, 22) == 0b11100101 || + Instruction_aarch64::extract(uint_at(0), 29, 22) == 0b11100100; + } +public: + Register target() { + uint32_t r = Instruction_aarch64::extract(uint_at(0), 4, 0); + return r == 0x1f ? zr : as_Register(r); + } + Register base() { + uint32_t b = Instruction_aarch64::extract(uint_at(0), 9, 5); + return b == 0x1f ? 
sp : as_Register(b); + } + int64_t offset() { + if (is_ldst_ur()) { + return Instruction_aarch64::sextract(uint_at(0), 20, 12); + } else if (is_ldst_unsigned_offset()) { + return Instruction_aarch64::extract(uint_at(0), 21, 10) << size(); + } else { + // others like: pre-index or post-index. + ShouldNotReachHere(); + return 0; + } + } + size_t size_in_bytes() { return 1 << size(); } + bool is_not_pre_post_index() { return (is_ldst_ur() || is_ldst_unsigned_offset()); } + bool is_load() { + assert(Instruction_aarch64::extract(uint_at(0), 23, 22) == 0b01 || + Instruction_aarch64::extract(uint_at(0), 23, 22) == 0b00, "must be ldr or str"); + + return Instruction_aarch64::extract(uint_at(0), 23, 22) == 0b01; + } + bool is_store() { + assert(Instruction_aarch64::extract(uint_at(0), 23, 22) == 0b01 || + Instruction_aarch64::extract(uint_at(0), 23, 22) == 0b00, "must be ldr or str"); + + return Instruction_aarch64::extract(uint_at(0), 23, 22) == 0b00; + } +}; + +inline NativeLdSt *NativeLdSt_at(address addr) { + assert(nativeInstruction_at(addr)->is_Imm_LdSt(), "no immediate load/store found"); + return (NativeLdSt*)addr; +} #endif // CPU_AARCH64_VM_NATIVEINST_AARCH64_HPP diff --git a/src/hotspot/share/asm/codeBuffer.hpp b/src/hotspot/share/asm/codeBuffer.hpp --- a/src/hotspot/share/asm/codeBuffer.hpp +++ b/src/hotspot/share/asm/codeBuffer.hpp @@ -380,7 +380,7 @@ OopRecorder _default_oop_recorder; // override with initialize_oop_recorder Arena* _overflow_arena; - address _last_membar; // used to merge consecutive memory barriers + address _last_insn; // used to merge consecutive memory barriers, loads or stores. 
address _decode_begin; // start address for decode address decode_begin(); @@ -395,7 +395,7 @@ _decode_begin = NULL; _overflow_arena = NULL; _code_strings = CodeStrings(); - _last_membar = NULL; + _last_insn = NULL; } void initialize(address code_start, csize_t code_size) { @@ -587,9 +587,9 @@ OopRecorder* oop_recorder() const { return _oop_recorder; } CodeStrings& strings() { return _code_strings; } - address last_membar() const { return _last_membar; } - void set_last_membar(address a) { _last_membar = a; } - void clear_last_membar() { set_last_membar(NULL); } + address last_insn() const { return _last_insn; } + void set_last_insn(address a) { _last_insn = a; } + void clear_last_insn() { set_last_insn(NULL); } void free_strings() { if (!_code_strings.is_null()) {