--- old/src/cpu/aarch64/vm/aarch64.ad 2016-01-11 15:10:30.244904000 +0000 +++ new/src/cpu/aarch64/vm/aarch64.ad 2016-01-11 15:10:29.974904000 +0000 @@ -4442,11 +4442,7 @@ enc_class aarch64_enc_mov_byte_map_base(iRegP dst, immByteMapBase src) %{ MacroAssembler _masm(&cbuf); - address page = (address)$src$$constant; - Register dst_reg = as_Register($dst$$reg); - unsigned long off; - __ adrp(dst_reg, ExternalAddress(page), off); - assert(off == 0, "assumed offset == 0"); + __ load_byte_map_base($dst$$Register); %} enc_class aarch64_enc_mov_n(iRegN dst, immN src) %{ --- old/src/cpu/aarch64/vm/c1_Runtime1_aarch64.cpp 2016-01-11 15:10:31.694904000 +0000 +++ new/src/cpu/aarch64/vm/c1_Runtime1_aarch64.cpp 2016-01-11 15:10:31.434904000 +0000 @@ -1148,9 +1148,6 @@ #if INCLUDE_ALL_GCS -// Registers to be saved around calls to g1_wb_pre or g1_wb_post -#define G1_SAVE_REGS (RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2)) - case g1_pre_barrier_slow_id: { StubFrame f(sasm, "g1_pre_barrier", dont_gc_arguments); @@ -1192,10 +1189,10 @@ __ b(done); __ bind(runtime); - __ push(G1_SAVE_REGS, sp); + __ push_call_clobbered_registers(); f.load_argument(0, pre_val); __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread); - __ pop(G1_SAVE_REGS, sp); + __ pop_call_clobbered_registers(); __ bind(done); } break; @@ -1223,45 +1220,49 @@ Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_buf())); - const Register card_addr = rscratch2; - ExternalAddress cardtable((address) ct->byte_map_base); - - f.load_argument(0, card_addr); - __ lsr(card_addr, card_addr, CardTableModRefBS::card_shift); - unsigned long offset; - __ adrp(rscratch1, cardtable, offset); - __ add(card_addr, card_addr, rscratch1); - __ ldrb(rscratch1, Address(card_addr, offset)); + const Register card_offset = rscratch2; + // LR is free here, so we can use it to hold the byte_map_base. + const Register byte_map_base = lr; + + assert_different_registers(card_offset, byte_map_base, rscratch1); + + f.load_argument(0, card_offset); + __ lsr(card_offset, card_offset, CardTableModRefBS::card_shift); + __ load_byte_map_base(byte_map_base); + __ ldrb(rscratch1, Address(byte_map_base, card_offset)); __ cmpw(rscratch1, (int)G1SATBCardTableModRefBS::g1_young_card_val()); __ br(Assembler::EQ, done); assert((int)CardTableModRefBS::dirty_card_val() == 0, "must be 0"); __ membar(Assembler::StoreLoad); - __ ldrb(rscratch1, Address(card_addr, offset)); + __ ldrb(rscratch1, Address(byte_map_base, card_offset)); __ cbzw(rscratch1, done); // storing region crossing non-NULL, card is clean. // dirty card and log. - __ strb(zr, Address(card_addr, offset)); + __ strb(zr, Address(byte_map_base, card_offset)); + + // Convert card offset into an address in card_addr + Register card_addr = card_offset; + __ add(card_addr, byte_map_base, card_addr); __ ldr(rscratch1, queue_index); __ cbz(rscratch1, runtime); __ sub(rscratch1, rscratch1, wordSize); __ str(rscratch1, queue_index); - const Register buffer_addr = r0; + // Reuse LR to hold buffer_addr + const Register buffer_addr = lr; - __ push(RegSet::of(r0, r1), sp); __ ldr(buffer_addr, buffer); __ str(card_addr, Address(buffer_addr, rscratch1)); - __ pop(RegSet::of(r0, r1), sp); __ b(done); __ bind(runtime); - __ push(G1_SAVE_REGS, sp); + __ push_call_clobbered_registers(); __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread); - __ pop(G1_SAVE_REGS, sp); + __ pop_call_clobbered_registers(); __ bind(done); } --- old/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp 2016-01-11 15:10:32.644904000 +0000 +++ new/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp 2016-01-11 15:10:32.384904000 +0000 @@ -2301,6 +2301,30 @@ } #endif +void MacroAssembler::push_call_clobbered_registers() { + push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp); + + // Push v0-v7, v16-v31. + for (int i = 30; i >= 0; i -= 2) { + if (i <= v7->encoding() || i >= v16->encoding()) { + stpd(as_FloatRegister(i), as_FloatRegister(i+1), + Address(pre(sp, -2 * wordSize))); + } + } +} + +void MacroAssembler::pop_call_clobbered_registers() { + + for (int i = 0; i < 32; i += 2) { + if (i <= v7->encoding() || i >= v16->encoding()) { + ldpd(as_FloatRegister(i), as_FloatRegister(i+1), + Address(post(sp, 2 * wordSize))); + } + } + + pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp); +} + void MacroAssembler::push_CPU_state(bool save_vectors) { push(0x3fffffff, sp); // integer registers except lr & sp @@ -3099,12 +3123,7 @@ assert(CardTableModRefBS::dirty_card_val() == 0, "must be"); - { - ExternalAddress cardtable((address) ct->byte_map_base); - unsigned long offset; - adrp(rscratch1, cardtable, offset); - assert(offset == 0, "byte_map_base is misaligned"); - } + load_byte_map_base(rscratch1); if (UseCondCardMark) { Label L_already_dirty; @@ -3596,12 +3615,10 @@ lsr(card_addr, store_addr, CardTableModRefBS::card_shift); - unsigned long offset; - adrp(tmp2, cardtable, offset); - // get the address of the card + load_byte_map_base(tmp2); add(card_addr, card_addr, tmp2); - ldrb(tmp2, Address(card_addr, offset)); + ldrb(tmp2, Address(card_addr)); cmpw(tmp2, (int)G1SATBCardTableModRefBS::g1_young_card_val()); br(Assembler::EQ, done); @@ -3609,13 +3626,13 @@ membar(Assembler::StoreLoad); - ldrb(tmp2, Address(card_addr, offset)); + ldrb(tmp2, Address(card_addr)); cbzw(tmp2, done); // storing a region crossing, non-NULL oop, card is clean. // dirty card and log. - strb(zr, Address(card_addr, offset)); + strb(zr, Address(card_addr)); ldr(rscratch1, queue_index); cbz(rscratch1, runtime); @@ -3971,6 +3988,9 @@ long offset_low = dest_page - low_page; long offset_high = dest_page - high_page; + assert(is_valid_AArch64_address(dest.target()), "bad address"); + assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address"); + InstructionMark im(this); code_section()->relocate(inst_mark(), dest.rspec()); // 8143067: Ensure that the adrp can reach the dest from anywhere within @@ -3982,11 +4002,26 @@ long offset = dest_page - pc_page; offset = (offset & ((1<<20)-1)) << 12; _adrp(reg1, pc()+offset); - movk(reg1, ((unsigned long)dest.target() >> 32) & 0xffff, 32); + movk(reg1, (unsigned long)dest.target() >> 32, 32); } byte_offset = (unsigned long)dest.target() & 0xfff; } +void MacroAssembler::load_byte_map_base(Register reg) { + jbyte *byte_map_base = + ((CardTableModRefBS*)(Universe::heap()->barrier_set()))->byte_map_base; + + if (is_valid_AArch64_address((address)byte_map_base)) { + // Strictly speaking the byte_map_base isn't an address at all, + // and it might even be negative. + unsigned long offset; + adrp(reg, ExternalAddress((address)byte_map_base), offset); + assert(offset == 0, "misaligned card table base"); + } else { + mov(reg, (uint64_t)byte_map_base); + } +} + void MacroAssembler::build_frame(int framesize) { assert(framesize > 0, "framesize must be > 0"); if (framesize < ((1 << 9) + 2 * wordSize)) { --- old/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp 2016-01-11 15:10:33.684904000 +0000 +++ new/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp 2016-01-11 15:10:33.434904000 +0000 @@ -444,6 +444,13 @@ void push(RegSet regs, Register stack) { if (regs.bits()) push(regs.bits(), stack); } void pop(RegSet regs, Register stack) { if (regs.bits()) pop(regs.bits(), stack); } + // Push and pop everything that might be clobbered by a native + // runtime call except rscratch1 and rscratch2. (They are always + // scratch, so we don't have to protect them.) Only save the lower + // 64 bits of each vector register. + void push_call_clobbered_registers(); + void pop_call_clobbered_registers(); + // now mov instructions for loading absolute addresses and 32 or // 64 bit integers @@ -1125,6 +1132,15 @@ // of your data. Address form_address(Register Rd, Register base, long byte_offset, int shift); + // Return true iff an address is within the 48-bit AArch64 address + // space. + bool is_valid_AArch64_address(address a) { + return ((uint64_t)a >> 48) == 0; + } + + // Load the base of the cardtable byte map into reg. + void load_byte_map_base(Register reg); + // Prolog generator routines to support switch between x86 code and // generated ARM code --- old/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp 2016-01-11 15:10:34.704904000 +0000 +++ new/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp 2016-01-11 15:10:34.444904000 +0000 @@ -744,7 +744,7 @@ __ sub(end, end, start); // number of bytes to copy const Register count = end; // 'end' register contains bytes count now - __ mov(scratch, (address)ct->byte_map_base); + __ load_byte_map_base(scratch); __ add(start, start, scratch); if (UseConcMarkSweepGC) { __ membar(__ StoreStore);