hotspot/src/cpu/sparc/vm/assembler_sparc.cpp

rev 611 : Merge

*** 1,10 ****
- #ifdef USE_PRAGMA_IDENT_SRC
- #pragma ident "@(#)assembler_sparc.cpp 1.208 07/08/29 13:42:15 JVM"
- #endif
  /*
!  * Copyright 1997-2007 Sun Microsystems, Inc.  All Rights Reserved.
   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
   * This code is free software; you can redistribute it and/or modify it
   * under the terms of the GNU General Public License version 2 only, as
   * published by the Free Software Foundation.
--- 1,7 ----
  /*
!  * Copyright 1997-2008 Sun Microsystems, Inc.  All Rights Reserved.
   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
   * This code is free software; you can redistribute it and/or modify it
   * under the terms of the GNU General Public License version 2 only, as
   * published by the Free Software Foundation.
*** 131,140 ****
--- 128,151 ----
  int AbstractAssembler::code_fill_byte() {
    return 0x00;                  // illegal instruction 0x00000000
  }
  
+ Assembler::Condition Assembler::reg_cond_to_cc_cond(Assembler::RCondition in) {
+   switch (in) {
+   case rc_z:   return equal;
+   case rc_lez: return lessEqual;
+   case rc_lz:  return less;
+   case rc_nz:  return notEqual;
+   case rc_gz:  return greater;
+   case rc_gez: return greaterEqual;
+   default:
+     ShouldNotReachHere();
+   }
+   return equal;
+ }
+ 
  // Generate a bunch 'o stuff (including v9's
  #ifndef PRODUCT
  void Assembler::test_v9() {
    add(    G0, G1, G2 );
    add(    G3,  0, G4 );
*** 1214,1248 ****
    st_ptr(oop_result, vm_result_addr);
  }
  
! void MacroAssembler::store_check(Register tmp, Register obj) {
!   // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
! 
!   /* $$$ This stuff needs to go into one of the BarrierSet generator
!      functions.  (The particular barrier sets will have to be friends of
!      MacroAssembler, I guess.) */
!   BarrierSet* bs = Universe::heap()->barrier_set();
!   assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
!   CardTableModRefBS* ct = (CardTableModRefBS*)bs;
!   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
  #ifdef _LP64
    srlx(obj, CardTableModRefBS::card_shift, obj);
  #else
    srl(obj, CardTableModRefBS::card_shift, obj);
  #endif
    assert( tmp != obj, "need separate temp reg");
!   Address rs(tmp, (address)ct->byte_map_base);
    load_address(rs);
    stb(G0, rs.base(), obj);
  }
  
- void MacroAssembler::store_check(Register tmp, Register obj, Register offset) {
-   store_check(tmp, obj);
- }
- 
  // %%% Note:  The following six instructions have been moved,
  // unchanged, from assembler_sparc.inline.hpp.
  // They will be refactored at a later date.
  
  void MacroAssembler::sethi(intptr_t imm22a,
--- 1225,1247 ----
    st_ptr(oop_result, vm_result_addr);
  }
  
! void MacroAssembler::card_table_write(jbyte* byte_map_base,
!                                       Register tmp, Register obj) {
  #ifdef _LP64
    srlx(obj, CardTableModRefBS::card_shift, obj);
  #else
    srl(obj, CardTableModRefBS::card_shift, obj);
  #endif
    assert( tmp != obj, "need separate temp reg");
!   Address rs(tmp, (address)byte_map_base);
    load_address(rs);
    stb(G0, rs.base(), obj);
  }
  
  // %%% Note:  The following six instructions have been moved,
  // unchanged, from assembler_sparc.inline.hpp.
  // They will be refactored at a later date.
  
  void MacroAssembler::sethi(intptr_t imm22a,
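[Editor's note, not part of the webrev: the two-instruction sequence kept in card_table_write (shift, then a byte store of %g0) boils down to the following C++ sketch. Illustrative only; it assumes the usual CardTableModRefBS parameters of 512-byte cards (card_shift == 9) and 0 as the "dirty" value.]

  #include <stdint.h>
  typedef signed char jbyte;

  // Mark the card covering 'obj' as dirty: index the card table by the
  // object address divided by the card size, then store a zero byte.
  static void card_table_write_sketch(jbyte* byte_map_base, const void* obj) {
    const int card_shift = 9;                              // 2^9 = 512-byte cards (assumed)
    jbyte* card = byte_map_base + ((uintptr_t)obj >> card_shift);
    *card = 0;                                             // 0 == dirty; stb %g0 stores zero
  }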
*** 1524,1533 ****
--- 1523,1547 ----
    assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
    int oop_index = oop_recorder()->find_index(obj);
    return Address(d, address(obj), oop_Relocation::spec(oop_index));
  }
  
+ void MacroAssembler::set_narrow_oop(jobject obj, Register d) {
+   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
+   int oop_index = oop_recorder()->find_index(obj);
+   RelocationHolder rspec = oop_Relocation::spec(oop_index);
+ 
+   assert_not_delayed();
+   // Relocation with special format (see relocInfo_sparc.hpp).
+   relocate(rspec, 1);
+   // Assembler::sethi(0x3fffff, d);
+   emit_long( op(branch_op) | rd(d) | op2(sethi_op2) | hi22(0x3fffff) );
+   // Don't add relocation for 'add'. Do patching during 'sethi' processing.
+   add(d, 0x3ff, d);
+ 
+ }
+ 
  void MacroAssembler::align(int modulus) {
    while (offset() % modulus != 0) nop();
  }
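[Editor's note, not part of the webrev: the sethi/add pair emitted by set_narrow_oop only reserves room for a 32-bit narrow-oop constant; 0x3fffff and 0x3ff are placeholder bit patterns that the relocation code patches later. As a reminder of how SPARC splits a 32-bit value across such a pair (a sketch, not HotSpot code):]

  #include <stdint.h>

  // sethi writes bits 31..10 of a value and clears the low 10 bits;
  // the following add supplies bits 9..0.
  static uint32_t hi22(uint32_t v) { return v >> 10; }    // field patched into the sethi
  static uint32_t lo10(uint32_t v) { return v & 0x3ff; }  // immediate patched into the add

  static uint32_t reassemble(uint32_t v) {
    return (hi22(v) << 10) | lo10(v);                     // == v for any 32-bit value
  }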
*** 1649,1663 ****
    // plausibility check for oops
    if (!VerifyOops) return;
  
    if (reg == G0)  return;       // always NULL, which is always an oop
  
!   char buffer[16];
    sprintf(buffer, "%d", line);
!   int len = strlen(file) + strlen(msg) + 1 + 4 + strlen(buffer);
    char * real_msg = new char[len];
!   sprintf(real_msg, "%s (%s:%d)", msg, file, line);
  
    // Call indirectly to solve generation ordering problem
    Address a(O7, (address)StubRoutines::verify_oop_subroutine_entry_address());
  
    // Make some space on stack above the current register window.
--- 1663,1687 ----
    // plausibility check for oops
    if (!VerifyOops) return;
  
    if (reg == G0)  return;       // always NULL, which is always an oop
  
!   char buffer[64];
! #ifdef COMPILER1
!   if (CommentedAssembly) {
!     snprintf(buffer, sizeof(buffer), "verify_oop at %d", offset());
!     block_comment(buffer);
!   }
! #endif
! 
!   int len = strlen(file) + strlen(msg) + 1 + 4;
    sprintf(buffer, "%d", line);
!   len += strlen(buffer);
!   sprintf(buffer, " at offset %d ", offset());
!   len += strlen(buffer);
    char * real_msg = new char[len];
!   sprintf(real_msg, "%s%s(%s:%d)", msg, buffer, file, line);
  
    // Call indirectly to solve generation ordering problem
    Address a(O7, (address)StubRoutines::verify_oop_subroutine_entry_address());
  
    // Make some space on stack above the current register window.
*** 1780,1800 ****
      delayed()->nop();
    }
  
    // Check the klassOop of this object for being in the right area of memory.
    // Cannot do the load in the delay above slot in case O0 is null
!   ld_ptr(Address(O0_obj, 0, oopDesc::klass_offset_in_bytes()), O0_obj);
    // assert((klass & klass_mask) == klass_bits);
    if( Universe::verify_klass_mask() != Universe::verify_oop_mask() )
      set(Universe::verify_klass_mask(), O2_mask);
    if( Universe::verify_klass_bits() != Universe::verify_oop_bits() )
      set(Universe::verify_klass_bits(), O3_bits);
    and3(O0_obj, O2_mask, O4_temp);
    cmp(O4_temp, O3_bits);
    brx(notEqual, false, pn, fail);
    // Check the klass's klass
!   delayed()->ld_ptr(Address(O0_obj, 0, oopDesc::klass_offset_in_bytes()), O0_obj);
    and3(O0_obj, O2_mask, O4_temp);
    cmp(O4_temp, O3_bits);
    brx(notEqual, false, pn, fail);
    delayed()->wrccr( O5_save_flags ); // Restore CCR's
--- 1804,1825 ----
      delayed()->nop();
    }
  
    // Check the klassOop of this object for being in the right area of memory.
    // Cannot do the load in the delay above slot in case O0 is null
!   load_klass(O0_obj, O0_obj);
    // assert((klass & klass_mask) == klass_bits);
    if( Universe::verify_klass_mask() != Universe::verify_oop_mask() )
      set(Universe::verify_klass_mask(), O2_mask);
    if( Universe::verify_klass_bits() != Universe::verify_oop_bits() )
      set(Universe::verify_klass_bits(), O3_bits);
    and3(O0_obj, O2_mask, O4_temp);
    cmp(O4_temp, O3_bits);
    brx(notEqual, false, pn, fail);
+   delayed()->nop();
    // Check the klass's klass
!   load_klass(O0_obj, O0_obj);
    and3(O0_obj, O2_mask, O4_temp);
    cmp(O4_temp, O3_bits);
    brx(notEqual, false, pn, fail);
    delayed()->wrccr( O5_save_flags ); // Restore CCR's
*** 2044,2053 ****
--- 2069,2099 ----
      tst(s1);
      br ( notZero, a, p, L );
  #endif
  }
  
+ void MacroAssembler::br_on_reg_cond( RCondition rc, bool a, Predict p,
+                                      Register s1, address d,
+                                      relocInfo::relocType rt ) {
+   if (VM_Version::v9_instructions_work()) {
+     bpr(rc, a, p, s1, d, rt);
+   } else {
+     tst(s1);
+     br(reg_cond_to_cc_cond(rc), a, p, d, rt);
+   }
+ }
+ 
+ void MacroAssembler::br_on_reg_cond( RCondition rc, bool a, Predict p,
+                                      Register s1, Label& L ) {
+   if (VM_Version::v9_instructions_work()) {
+     bpr(rc, a, p, s1, L);
+   } else {
+     tst(s1);
+     br(reg_cond_to_cc_cond(rc), a, p, L);
+   }
+ }
+ 
  // instruction sequences factored across compiler & interpreter
  
  void MacroAssembler::lcmp( Register Ra_hi, Register Ra_low,
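[Editor's note, not part of the webrev: a hypothetical call site for the new helper, written as it would appear inside a MacroAssembler method; the label and register are made up. On a V9 CPU this emits a single bpr, while on older hardware it degrades to tst plus a conditional branch via reg_cond_to_cc_cond. The G1 barrier code added later in this change uses exactly this pattern.]

  // Branch to is_null when O0 holds zero; annulment off, predicted not taken.
  Label is_null;
  br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pn, O0, is_null);
  delayed()->nop();   // delay slot left empty in this sketch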
*** 2567,2577 ****
      restore();
    }
  }
  
! void MacroAssembler::biased_locking_enter(Register obj_reg, Register mark_reg, Register temp_reg,
                                            Label& done, Label* slow_case,
                                            BiasedLockingCounters* counters) {
    assert(UseBiasedLocking, "why call this otherwise?");
  
    if (PrintBiasedLockingStatistics) {
--- 2613,2624 ----
      restore();
    }
  }
  
! void MacroAssembler::biased_locking_enter(Register obj_reg, Register mark_reg,
!                                           Register temp_reg,
                                            Label& done, Label* slow_case,
                                            BiasedLockingCounters* counters) {
    assert(UseBiasedLocking, "why call this otherwise?");
  
    if (PrintBiasedLockingStatistics) {
*** 2589,2600 ****
    // pointers to allow age to be placed into low bits
    assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits,
           "biased locking makes assumptions about bit layout");
    and3(mark_reg, markOopDesc::biased_lock_mask_in_place, temp_reg);
    cmp(temp_reg, markOopDesc::biased_lock_pattern);
    brx(Assembler::notEqual, false, Assembler::pn, cas_label);
  
!   delayed()->ld_ptr(Address(obj_reg, 0, oopDesc::klass_offset_in_bytes()), temp_reg);
    ld_ptr(Address(temp_reg, 0, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()), temp_reg);
    or3(G2_thread, temp_reg, temp_reg);
    xor3(mark_reg, temp_reg, temp_reg);
    andcc(temp_reg, ~((int) markOopDesc::age_mask_in_place), temp_reg);
    if (counters != NULL) {
--- 2636,2648 ----
    // pointers to allow age to be placed into low bits
    assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits,
           "biased locking makes assumptions about bit layout");
    and3(mark_reg, markOopDesc::biased_lock_mask_in_place, temp_reg);
    cmp(temp_reg, markOopDesc::biased_lock_pattern);
    brx(Assembler::notEqual, false, Assembler::pn, cas_label);
+   delayed()->nop();
  
!   load_klass(obj_reg, temp_reg);
    ld_ptr(Address(temp_reg, 0, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()), temp_reg);
    or3(G2_thread, temp_reg, temp_reg);
    xor3(mark_reg, temp_reg, temp_reg);
    andcc(temp_reg, ~((int) markOopDesc::age_mask_in_place), temp_reg);
    if (counters != NULL) {
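[Editor's note, not part of the webrev: the and3/cmp pair above is the usual mark-word triage. In C++ terms it is a mask-and-compare on the low bits of the header; the constants below are the values markOop.hpp defines (two lock bits plus one bias bit), shown here only for illustration.]

  #include <stdint.h>

  static const uintptr_t biased_lock_mask_in_place = 0x7;  // low 3 bits of the mark word
  static const uintptr_t biased_lock_pattern       = 0x5;  // 0b101 => "biased" state

  // True when the object header says the lock is (or may be) biased.
  static bool has_bias_pattern(uintptr_t mark) {
    return (mark & biased_lock_mask_in_place) == biased_lock_pattern;
  }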
*** 2642,2653 ****
    // don't accidentally blow away another thread's valid bias.
    delayed()->and3(mark_reg,
                    markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place,
                    mark_reg);
    or3(G2_thread, mark_reg, temp_reg);
!   casx_under_lock(mark_addr.base(), mark_reg, temp_reg,
!                   (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr());
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    cmp(mark_reg, temp_reg);
--- 2690,2700 ----
    // don't accidentally blow away another thread's valid bias.
    delayed()->and3(mark_reg,
                    markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place,
                    mark_reg);
    or3(G2_thread, mark_reg, temp_reg);
!   casn(mark_addr.base(), mark_reg, temp_reg);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    cmp(mark_reg, temp_reg);
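[Editor's note, not part of the webrev: casn is the pointer-sized compare-and-swap wrapper (casx on 64-bit, cas on 32-bit) replacing the lock-assisted casx_under_lock here and below. Its contract, ignoring atomicity, is the classic CAS sketched next; the following cmp(mark_reg, temp_reg) then tests whether the old memory value matched the expected one.]

  #include <stdint.h>

  // Sketch of the CAS contract (the real instruction does this atomically):
  // compare memory at 'addr' with 'expected'; if equal, store 'new_val'.
  // The previous memory value is always returned (it ends up in temp_reg above).
  static intptr_t cas_ptr_sketch(volatile intptr_t* addr, intptr_t expected, intptr_t new_val) {
    intptr_t old = *addr;
    if (old == expected) *addr = new_val;
    return old;
  }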
*** 2669,2683 ****
    // bias in the current epoch. In other words, we allow transfer of
    // the bias from one thread to another directly in this situation.
    //
    // FIXME: due to a lack of registers we currently blow away the age
    // bits in this situation. Should attempt to preserve them.
!   ld_ptr(Address(obj_reg, 0, oopDesc::klass_offset_in_bytes()), temp_reg);
    ld_ptr(Address(temp_reg, 0, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()), temp_reg);
    or3(G2_thread, temp_reg, temp_reg);
!   casx_under_lock(mark_addr.base(), mark_reg, temp_reg,
!                   (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr());
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    cmp(mark_reg, temp_reg);
--- 2716,2729 ----
    // bias in the current epoch. In other words, we allow transfer of
    // the bias from one thread to another directly in this situation.
    //
    // FIXME: due to a lack of registers we currently blow away the age
    // bits in this situation. Should attempt to preserve them.
!   load_klass(obj_reg, temp_reg);
    ld_ptr(Address(temp_reg, 0, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()), temp_reg);
    or3(G2_thread, temp_reg, temp_reg);
!   casn(mark_addr.base(), mark_reg, temp_reg);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    cmp(mark_reg, temp_reg);
*** 2701,2714 ****
    // bias of this particular object, so it's okay to continue in the
    // normal locking code.
    //
    // FIXME: due to a lack of registers we currently blow away the age
    // bits in this situation. Should attempt to preserve them.
!   ld_ptr(Address(obj_reg, 0, oopDesc::klass_offset_in_bytes()), temp_reg);
    ld_ptr(Address(temp_reg, 0, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()), temp_reg);
!   casx_under_lock(mark_addr.base(), mark_reg, temp_reg,
!                   (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr());
    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      cmp(mark_reg, temp_reg);
--- 2747,2759 ----
    // bias of this particular object, so it's okay to continue in the
    // normal locking code.
    //
    // FIXME: due to a lack of registers we currently blow away the age
    // bits in this situation. Should attempt to preserve them.
!   load_klass(obj_reg, temp_reg);
    ld_ptr(Address(temp_reg, 0, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()), temp_reg);
!   casn(mark_addr.base(), mark_reg, temp_reg);
    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      cmp(mark_reg, temp_reg);
*** 2766,2777 ****
  // and compiler_unlock_object.  Critically, the key factor is code size, not path
  // length.  (Simply experiments to pad CLO with unexecuted NOPs demonstrte the
  // effect).
  
! void MacroAssembler::compiler_lock_object(Register Roop, Register Rmark, Register Rbox, Register Rscratch,
!                                           BiasedLockingCounters* counters) {
    Address mark_addr(Roop, 0, oopDesc::mark_offset_in_bytes());
  
    verify_oop(Roop);
    Label done ;
--- 2811,2824 ----
  // and compiler_unlock_object.  Critically, the key factor is code size, not path
  // length.  (Simply experiments to pad CLO with unexecuted NOPs demonstrte the
  // effect).
  
! void MacroAssembler::compiler_lock_object(Register Roop, Register Rmark,
!                                           Register Rbox, Register Rscratch,
!                                           BiasedLockingCounters* counters,
!                                           bool try_bias) {
    Address mark_addr(Roop, 0, oopDesc::mark_offset_in_bytes());
  
    verify_oop(Roop);
    Label done ;
*** 2789,2799 ****
    if (EmitSync & 2) {
  
      // Fetch object's markword
      ld_ptr(mark_addr, Rmark);
  
!     if (UseBiasedLocking) {
        biased_locking_enter(Roop, Rmark, Rscratch, done, NULL, counters);
      }
  
      // Save Rbox in Rscratch to be used for the cas operation
      mov(Rbox, Rscratch);
--- 2836,2846 ----
    if (EmitSync & 2) {
  
      // Fetch object's markword
      ld_ptr(mark_addr, Rmark);
  
!     if (try_bias) {
        biased_locking_enter(Roop, Rmark, Rscratch, done, NULL, counters);
      }
  
      // Save Rbox in Rscratch to be used for the cas operation
      mov(Rbox, Rscratch);
*** 2832,2842 ****
    if (EmitSync & 256) {
      Label IsInflated ;
  
      ld_ptr (mark_addr, Rmark);           // fetch obj->mark
      // Triage: biased, stack-locked, neutral, inflated
!     if (UseBiasedLocking) {
        biased_locking_enter(Roop, Rmark, Rscratch, done, NULL, counters);
        // Invariant: if control reaches this point in the emitted stream
        // then Rmark has not been modified.
      }
  
--- 2879,2889 ----
    if (EmitSync & 256) {
      Label IsInflated ;
  
      ld_ptr (mark_addr, Rmark);           // fetch obj->mark
      // Triage: biased, stack-locked, neutral, inflated
!     if (try_bias) {
        biased_locking_enter(Roop, Rmark, Rscratch, done, NULL, counters);
        // Invariant: if control reaches this point in the emitted stream
        // then Rmark has not been modified.
      }
  
*** 2896,2906 ****
      // prefetch (mark_addr, Assembler::severalWritesAndPossiblyReads) ;
  
      ld_ptr (mark_addr, Rmark);           // fetch obj->mark
      // Triage: biased, stack-locked, neutral, inflated
!     if (UseBiasedLocking) {
        biased_locking_enter(Roop, Rmark, Rscratch, done, NULL, counters);
        // Invariant: if control reaches this point in the emitted stream
        // then Rmark has not been modified.
      }
  
      andcc  (Rmark, 2, G0) ;
--- 2943,2953 ----
      // prefetch (mark_addr, Assembler::severalWritesAndPossiblyReads) ;
  
      ld_ptr (mark_addr, Rmark);           // fetch obj->mark
      // Triage: biased, stack-locked, neutral, inflated
!     if (try_bias) {
        biased_locking_enter(Roop, Rmark, Rscratch, done, NULL, counters);
        // Invariant: if control reaches this point in the emitted stream
        // then Rmark has not been modified.
      }
  
      andcc  (Rmark, 2, G0) ;
*** 2990,3011 ****
      }
      bind (done) ;
  }
  
! void MacroAssembler::compiler_unlock_object(Register Roop, Register Rmark, Register Rbox, Register Rscratch) {
    Address mark_addr(Roop, 0, oopDesc::mark_offset_in_bytes());
  
    Label done ;
  
    if (EmitSync & 4) {
      cmp  (SP, G0) ;
      return ;
    }
  
    if (EmitSync & 8) {
!     if (UseBiasedLocking) {
        biased_locking_exit(mark_addr, Rscratch, done);
      }
  
      // Test first if it is a fast recursive unlock
      ld_ptr(Rbox, BasicLock::displaced_header_offset_in_bytes(), Rmark);
--- 3037,3060 ----
      }
      bind (done) ;
  }
  
! void MacroAssembler::compiler_unlock_object(Register Roop, Register Rmark,
!                                             Register Rbox, Register Rscratch,
!                                             bool try_bias) {
    Address mark_addr(Roop, 0, oopDesc::mark_offset_in_bytes());
  
    Label done ;
  
    if (EmitSync & 4) {
      cmp  (SP, G0) ;
      return ;
    }
  
    if (EmitSync & 8) {
!     if (try_bias) {
        biased_locking_exit(mark_addr, Rscratch, done);
      }
  
      // Test first if it is a fast recursive unlock
      ld_ptr(Rbox, BasicLock::displaced_header_offset_in_bytes(), Rmark);
*** 3028,3038 ****
    // is too large performance rolls abruptly off a cliff.
    // This could be related to inlining policies, code cache management, or
    // I$ effects.
    Label LStacked ;
  
!   if (UseBiasedLocking) {
      // TODO: eliminate redundant LDs of obj->mark
      biased_locking_exit(mark_addr, Rscratch, done);
    }
  
    ld_ptr (Roop, oopDesc::mark_offset_in_bytes(), Rmark) ;
--- 3077,3087 ----
    // is too large performance rolls abruptly off a cliff.
    // This could be related to inlining policies, code cache management, or
    // I$ effects.
    Label LStacked ;
  
!   if (try_bias) {
      // TODO: eliminate redundant LDs of obj->mark
      biased_locking_exit(mark_addr, Rscratch, done);
    }
  
    ld_ptr (Roop, oopDesc::mark_offset_in_bytes(), Rmark) ;
*** 3225,3234 ****
--- 3274,3288 ----
    // make sure arguments make sense
    assert_different_registers(obj, var_size_in_bytes, t1, t2);
    assert(0 <= con_size_in_bytes && Assembler::is_simm13(con_size_in_bytes), "illegal object size");
    assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
  
+   if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
+     // No allocation in the shared eden.
+     br(Assembler::always, false, Assembler::pt, slow_case);
+     delayed()->nop();
+   } else {
    // get eden boundaries
    // note: we need both top & top_addr!
    const Register top_addr = t1;
    const Register end      = t2;
*** 3287,3296 ****
--- 3341,3351 ----
      delayed()->nop();
      stop("eden top is not properly aligned");
      bind(L);
    }
  #endif // ASSERT
+   }
  }
  
  
  void MacroAssembler::tlab_allocate(
    Register obj,                        // result: pointer to object after successful allocation
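[Editor's note, not part of the webrev: the guard added in the previous hunk branches straight to slow_case when inline allocation in a shared eden is not possible (CMS incremental mode, or a collector such as G1 whose eden is not one contiguous chunk). The code it now wraps is the familiar bump-the-pointer-with-CAS loop, roughly the following C++ sketch (not the HotSpot code itself; the GCC/Clang builtin CAS is used purely for illustration):]

  #include <stddef.h>
  #include <stdint.h>

  static void* eden_allocate_sketch(uintptr_t* top_addr, uintptr_t end, size_t size_in_bytes) {
    for (;;) {
      uintptr_t top     = *top_addr;
      uintptr_t new_top = top + size_in_bytes;
      if (new_top > end) return NULL;                 // no room left: take the slow case
      if (__sync_bool_compare_and_swap(top_addr, top, new_top))
        return (void*)top;                            // success: old top is the new object
      // lost the race to another thread: reload top and retry
    }
  }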
*** 3405,3421 ****
    delayed()->nop();
  
    set((intptr_t)markOopDesc::prototype()->copy_set_hash(0x2), t2);
    st_ptr(t2, top, oopDesc::mark_offset_in_bytes()); // set up the mark word
    // set klass to intArrayKlass
-   set((intptr_t)Universe::intArrayKlassObj_addr(), t2);
-   ld_ptr(t2, 0, t2);
-   st_ptr(t2, top, oopDesc::klass_offset_in_bytes());
    sub(t1, typeArrayOopDesc::header_size(T_INT), t1);
    add(t1, ThreadLocalAllocBuffer::alignment_reserve(), t1);
    sll_ptr(t1, log2_intptr(HeapWordSize/sizeof(jint)), t1);
    st(t1, top, arrayOopDesc::length_offset_in_bytes());
    verify_oop(top);
  
    // refill the tlab with an eden allocation
    bind(do_refill);
    ld_ptr(G2_thread, in_bytes(JavaThread::tlab_size_offset()), t1);
--- 3460,3478 ----
    delayed()->nop();
  
    set((intptr_t)markOopDesc::prototype()->copy_set_hash(0x2), t2);
    st_ptr(t2, top, oopDesc::mark_offset_in_bytes()); // set up the mark word
    // set klass to intArrayKlass
    sub(t1, typeArrayOopDesc::header_size(T_INT), t1);
    add(t1, ThreadLocalAllocBuffer::alignment_reserve(), t1);
    sll_ptr(t1, log2_intptr(HeapWordSize/sizeof(jint)), t1);
    st(t1, top, arrayOopDesc::length_offset_in_bytes());
+   set((intptr_t)Universe::intArrayKlassObj_addr(), t2);
+   ld_ptr(t2, 0, t2);
+   // store klass last.  concurrent gcs assumes klass length is valid if
+   // klass field is not null.
+   store_klass(t2, top);
    verify_oop(top);
  
    // refill the tlab with an eden allocation
    bind(do_refill);
    ld_ptr(G2_thread, in_bytes(JavaThread::tlab_size_offset()), t1);
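[Editor's note, not part of the webrev: the reordering in the previous hunk matters for concurrent collectors scanning the TLAB being retired. The klass pointer is published last, so a GC thread that observes a non-NULL klass is guaranteed to also see the already-initialized length field and can size the filler object safely. A minimal C++ sketch of that publication order, with a hypothetical field layout that is not the HotSpot object layout:]

  #include <stdint.h>

  struct FillerArraySketch {
    intptr_t mark;
    void*    klass;    // stays NULL until the object is fully initialized
    int      length;
  };

  static void init_filler_sketch(FillerArraySketch* obj, void* int_array_klass, int len) {
    obj->mark   = 0;                // prototype mark word (value elided in this sketch)
    obj->length = len;              // length first ...
    obj->klass  = int_array_klass;  // ... klass last: non-NULL klass implies valid length
  }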
*** 3535,3539 **** --- 3592,4211 ---- for (int i = 0; i< StackShadowPages-1; i++) { set((-i*offset)+STACK_BIAS, Rscratch); st(G0, Rtsp, Rscratch); } } + + /////////////////////////////////////////////////////////////////////////////////// + #ifndef SERIALGC + + static uint num_stores = 0; + static uint num_null_pre_stores = 0; + + static void count_null_pre_vals(void* pre_val) { + num_stores++; + if (pre_val == NULL) num_null_pre_stores++; + if ((num_stores % 1000000) == 0) { + tty->print_cr(UINT32_FORMAT " stores, " UINT32_FORMAT " (%5.2f%%) with null pre-vals.", + num_stores, num_null_pre_stores, + 100.0*(float)num_null_pre_stores/(float)num_stores); + } + } + + static address satb_log_enqueue_with_frame = 0; + static u_char* satb_log_enqueue_with_frame_end = 0; + + static address satb_log_enqueue_frameless = 0; + static u_char* satb_log_enqueue_frameless_end = 0; + + static int EnqueueCodeSize = 128 DEBUG_ONLY( + 256); // Instructions? + + // The calls to this don't work. We'd need to do a fair amount of work to + // make it work. + static void check_index(int ind) { + assert(0 <= ind && ind <= 64*K && ((ind % oopSize) == 0), + "Invariants.") + } + + static void generate_satb_log_enqueue(bool with_frame) { + BufferBlob* bb = BufferBlob::create("enqueue_with_frame", EnqueueCodeSize); + CodeBuffer buf(bb->instructions_begin(), bb->instructions_size()); + MacroAssembler masm(&buf); + address start = masm.pc(); + Register pre_val; + + Label refill, restart; + if (with_frame) { + masm.save_frame(0); + pre_val = I0; // Was O0 before the save. + } else { + pre_val = O0; + } + int satb_q_index_byte_offset = + in_bytes(JavaThread::satb_mark_queue_offset() + + PtrQueue::byte_offset_of_index()); + int satb_q_buf_byte_offset = + in_bytes(JavaThread::satb_mark_queue_offset() + + PtrQueue::byte_offset_of_buf()); + assert(in_bytes(PtrQueue::byte_width_of_index()) == sizeof(intptr_t) && + in_bytes(PtrQueue::byte_width_of_buf()) == sizeof(intptr_t), + "check sizes in assembly below"); + + masm.bind(restart); + masm.ld_ptr(G2_thread, satb_q_index_byte_offset, L0); + + masm.br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pn, L0, refill); + // If the branch is taken, no harm in executing this in the delay slot. + masm.delayed()->ld_ptr(G2_thread, satb_q_buf_byte_offset, L1); + masm.sub(L0, oopSize, L0); + + masm.st_ptr(pre_val, L1, L0); // [_buf + index] := I0 + if (!with_frame) { + // Use return-from-leaf + masm.retl(); + masm.delayed()->st_ptr(L0, G2_thread, satb_q_index_byte_offset); + } else { + // Not delayed. + masm.st_ptr(L0, G2_thread, satb_q_index_byte_offset); + } + if (with_frame) { + masm.ret(); + masm.delayed()->restore(); + } + masm.bind(refill); + + address handle_zero = + CAST_FROM_FN_PTR(address, + &SATBMarkQueueSet::handle_zero_index_for_thread); + // This should be rare enough that we can afford to save all the + // scratch registers that the calling context might be using. + masm.mov(G1_scratch, L0); + masm.mov(G3_scratch, L1); + masm.mov(G4, L2); + // We need the value of O0 above (for the write into the buffer), so we + // save and restore it. + masm.mov(O0, L3); + // Since the call will overwrite O7, we save and restore that, as well. 
+ masm.mov(O7, L4); + masm.call_VM_leaf(L5, handle_zero, G2_thread); + masm.mov(L0, G1_scratch); + masm.mov(L1, G3_scratch); + masm.mov(L2, G4); + masm.mov(L3, O0); + masm.br(Assembler::always, /*annul*/false, Assembler::pt, restart); + masm.delayed()->mov(L4, O7); + + if (with_frame) { + satb_log_enqueue_with_frame = start; + satb_log_enqueue_with_frame_end = masm.pc(); + } else { + satb_log_enqueue_frameless = start; + satb_log_enqueue_frameless_end = masm.pc(); + } + } + + static inline void generate_satb_log_enqueue_if_necessary(bool with_frame) { + if (with_frame) { + if (satb_log_enqueue_with_frame == 0) { + generate_satb_log_enqueue(with_frame); + assert(satb_log_enqueue_with_frame != 0, "postcondition."); + if (G1SATBPrintStubs) { + tty->print_cr("Generated with-frame satb enqueue:"); + Disassembler::decode((u_char*)satb_log_enqueue_with_frame, + satb_log_enqueue_with_frame_end, + tty); + } + } + } else { + if (satb_log_enqueue_frameless == 0) { + generate_satb_log_enqueue(with_frame); + assert(satb_log_enqueue_frameless != 0, "postcondition."); + if (G1SATBPrintStubs) { + tty->print_cr("Generated frameless satb enqueue:"); + Disassembler::decode((u_char*)satb_log_enqueue_frameless, + satb_log_enqueue_frameless_end, + tty); + } + } + } + } + + void MacroAssembler::g1_write_barrier_pre(Register obj, Register index, int offset, Register tmp, bool preserve_o_regs) { + assert(offset == 0 || index == noreg, "choose one"); + + if (G1DisablePreBarrier) return; + // satb_log_barrier(tmp, obj, offset, preserve_o_regs); + Label filtered; + // satb_log_barrier_work0(tmp, filtered); + if (in_bytes(PtrQueue::byte_width_of_active()) == 4) { + ld(G2, + in_bytes(JavaThread::satb_mark_queue_offset() + + PtrQueue::byte_offset_of_active()), + tmp); + } else { + guarantee(in_bytes(PtrQueue::byte_width_of_active()) == 1, + "Assumption"); + ldsb(G2, + in_bytes(JavaThread::satb_mark_queue_offset() + + PtrQueue::byte_offset_of_active()), + tmp); + } + // Check on whether to annul. + br_on_reg_cond(rc_z, /*annul*/false, Assembler::pt, tmp, filtered); + delayed() -> nop(); + + // satb_log_barrier_work1(tmp, offset); + if (index == noreg) { + if (Assembler::is_simm13(offset)) { + ld_ptr(obj, offset, tmp); + } else { + set(offset, tmp); + ld_ptr(obj, tmp, tmp); + } + } else { + ld_ptr(obj, index, tmp); + } + + // satb_log_barrier_work2(obj, tmp, offset); + + // satb_log_barrier_work3(tmp, filtered, preserve_o_regs); + + const Register pre_val = tmp; + + if (G1SATBBarrierPrintNullPreVals) { + save_frame(0); + mov(pre_val, O0); + // Save G-regs that target may use. + mov(G1, L1); + mov(G2, L2); + mov(G3, L3); + mov(G4, L4); + mov(G5, L5); + call(CAST_FROM_FN_PTR(address, &count_null_pre_vals)); + delayed()->nop(); + // Restore G-regs that target may have used. + mov(L1, G1); + mov(L2, G2); + mov(L3, G3); + mov(L4, G4); + mov(L5, G5); + restore(G0, G0, G0); + } + + // Check on whether to annul. + br_on_reg_cond(rc_z, /*annul*/false, Assembler::pt, pre_val, filtered); + delayed() -> nop(); + + // OK, it's not filtered, so we'll need to call enqueue. In the normal + // case, pre_val will be a scratch G-reg, but there's some cases in which + // it's an O-reg. In the first case, do a normal call. In the latter, + // do a save here and call the frameless version. + + guarantee(pre_val->is_global() || pre_val->is_out(), + "Or we need to think harder."); + if (pre_val->is_global() && !preserve_o_regs) { + generate_satb_log_enqueue_if_necessary(true); // with frame. 
+ call(satb_log_enqueue_with_frame); + delayed()->mov(pre_val, O0); + } else { + generate_satb_log_enqueue_if_necessary(false); // with frameless. + save_frame(0); + call(satb_log_enqueue_frameless); + delayed()->mov(pre_val->after_save(), O0); + restore(); + } + + bind(filtered); + } + + static jint num_ct_writes = 0; + static jint num_ct_writes_filtered_in_hr = 0; + static jint num_ct_writes_filtered_null = 0; + static G1CollectedHeap* g1 = NULL; + + static Thread* count_ct_writes(void* filter_val, void* new_val) { + Atomic::inc(&num_ct_writes); + if (filter_val == NULL) { + Atomic::inc(&num_ct_writes_filtered_in_hr); + } else if (new_val == NULL) { + Atomic::inc(&num_ct_writes_filtered_null); + } else { + if (g1 == NULL) { + g1 = G1CollectedHeap::heap(); + } + } + if ((num_ct_writes % 1000000) == 0) { + jint num_ct_writes_filtered = + num_ct_writes_filtered_in_hr + + num_ct_writes_filtered_null; + + tty->print_cr("%d potential CT writes: %5.2f%% filtered\n" + " (%5.2f%% intra-HR, %5.2f%% null).", + num_ct_writes, + 100.0*(float)num_ct_writes_filtered/(float)num_ct_writes, + 100.0*(float)num_ct_writes_filtered_in_hr/ + (float)num_ct_writes, + 100.0*(float)num_ct_writes_filtered_null/ + (float)num_ct_writes); + } + return Thread::current(); + } + + static address dirty_card_log_enqueue = 0; + static u_char* dirty_card_log_enqueue_end = 0; + + // This gets to assume that o0 contains the object address. + static void generate_dirty_card_log_enqueue(jbyte* byte_map_base) { + BufferBlob* bb = BufferBlob::create("dirty_card_enqueue", EnqueueCodeSize*2); + CodeBuffer buf(bb->instructions_begin(), bb->instructions_size()); + MacroAssembler masm(&buf); + address start = masm.pc(); + + Label not_already_dirty, restart, refill; + + #ifdef _LP64 + masm.srlx(O0, CardTableModRefBS::card_shift, O0); + #else + masm.srl(O0, CardTableModRefBS::card_shift, O0); + #endif + Address rs(O1, (address)byte_map_base); + masm.load_address(rs); // O1 := <card table base> + masm.ldub(O0, O1, O2); // O2 := [O0 + O1] + + masm.br_on_reg_cond(Assembler::rc_nz, /*annul*/false, Assembler::pt, + O2, not_already_dirty); + // Get O1 + O2 into a reg by itself -- useful in the take-the-branch + // case, harmless if not. + masm.delayed()->add(O0, O1, O3); + + // We didn't take the branch, so we're already dirty: return. + // Use return-from-leaf + masm.retl(); + masm.delayed()->nop(); + + // Not dirty. + masm.bind(not_already_dirty); + // First, dirty it. + masm.stb(G0, O3, G0); // [cardPtr] := 0 (i.e., dirty). + int dirty_card_q_index_byte_offset = + in_bytes(JavaThread::dirty_card_queue_offset() + + PtrQueue::byte_offset_of_index()); + int dirty_card_q_buf_byte_offset = + in_bytes(JavaThread::dirty_card_queue_offset() + + PtrQueue::byte_offset_of_buf()); + masm.bind(restart); + masm.ld_ptr(G2_thread, dirty_card_q_index_byte_offset, L0); + + masm.br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pn, + L0, refill); + // If the branch is taken, no harm in executing this in the delay slot. + masm.delayed()->ld_ptr(G2_thread, dirty_card_q_buf_byte_offset, L1); + masm.sub(L0, oopSize, L0); + + masm.st_ptr(O3, L1, L0); // [_buf + index] := I0 + // Use return-from-leaf + masm.retl(); + masm.delayed()->st_ptr(L0, G2_thread, dirty_card_q_index_byte_offset); + + masm.bind(refill); + address handle_zero = + CAST_FROM_FN_PTR(address, + &DirtyCardQueueSet::handle_zero_index_for_thread); + // This should be rare enough that we can afford to save all the + // scratch registers that the calling context might be using. 
+ masm.mov(G1_scratch, L3); + masm.mov(G3_scratch, L5); + // We need the value of O3 above (for the write into the buffer), so we + // save and restore it. + masm.mov(O3, L6); + // Since the call will overwrite O7, we save and restore that, as well. + masm.mov(O7, L4); + + masm.call_VM_leaf(L7_thread_cache, handle_zero, G2_thread); + masm.mov(L3, G1_scratch); + masm.mov(L5, G3_scratch); + masm.mov(L6, O3); + masm.br(Assembler::always, /*annul*/false, Assembler::pt, restart); + masm.delayed()->mov(L4, O7); + + dirty_card_log_enqueue = start; + dirty_card_log_enqueue_end = masm.pc(); + // XXX Should have a guarantee here about not going off the end! + // Does it already do so? Do an experiment... + } + + static inline void + generate_dirty_card_log_enqueue_if_necessary(jbyte* byte_map_base) { + if (dirty_card_log_enqueue == 0) { + generate_dirty_card_log_enqueue(byte_map_base); + assert(dirty_card_log_enqueue != 0, "postcondition."); + if (G1SATBPrintStubs) { + tty->print_cr("Generated dirty_card enqueue:"); + Disassembler::decode((u_char*)dirty_card_log_enqueue, + dirty_card_log_enqueue_end, + tty); + } + } + } + + + void MacroAssembler::g1_write_barrier_post(Register store_addr, Register new_val, Register tmp) { + + Label filtered; + MacroAssembler* post_filter_masm = this; + + if (new_val == G0) return; + if (G1DisablePostBarrier) return; + + G1SATBCardTableModRefBS* bs = (G1SATBCardTableModRefBS*) Universe::heap()->barrier_set(); + assert(bs->kind() == BarrierSet::G1SATBCT || + bs->kind() == BarrierSet::G1SATBCTLogging, "wrong barrier"); + if (G1RSBarrierRegionFilter) { + xor3(store_addr, new_val, tmp); + #ifdef _LP64 + srlx(tmp, HeapRegion::LogOfHRGrainBytes, tmp); + #else + srl(tmp, HeapRegion::LogOfHRGrainBytes, tmp); + #endif + if (G1PrintCTFilterStats) { + guarantee(tmp->is_global(), "Or stats won't work..."); + // This is a sleazy hack: I'm temporarily hijacking G2, which I + // promise to restore. + mov(new_val, G2); + save_frame(0); + mov(tmp, O0); + mov(G2, O1); + // Save G-regs that target may use. + mov(G1, L1); + mov(G2, L2); + mov(G3, L3); + mov(G4, L4); + mov(G5, L5); + call(CAST_FROM_FN_PTR(address, &count_ct_writes)); + delayed()->nop(); + mov(O0, G2); + // Restore G-regs that target may have used. + mov(L1, G1); + mov(L3, G3); + mov(L4, G4); + mov(L5, G5); + restore(G0, G0, G0); + } + // XXX Should I predict this taken or not? Does it mattern? + br_on_reg_cond(rc_z, /*annul*/false, Assembler::pt, tmp, filtered); + delayed()->nop(); + } + + // Now we decide how to generate the card table write. If we're + // enqueueing, we call out to a generated function. Otherwise, we do it + // inline here. + + if (G1RSBarrierUseQueue) { + // If the "store_addr" register is an "in" or "local" register, move it to + // a scratch reg so we can pass it as an argument. + bool use_scr = !(store_addr->is_global() || store_addr->is_out()); + // Pick a scratch register different from "tmp". + Register scr = (tmp == G1_scratch ? G3_scratch : G1_scratch); + // Make sure we use up the delay slot! 
+ if (use_scr) { + post_filter_masm->mov(store_addr, scr); + } else { + post_filter_masm->nop(); + } + generate_dirty_card_log_enqueue_if_necessary(bs->byte_map_base); + save_frame(0); + call(dirty_card_log_enqueue); + if (use_scr) { + delayed()->mov(scr, O0); + } else { + delayed()->mov(store_addr->after_save(), O0); + } + restore(); + + } else { + + #ifdef _LP64 + post_filter_masm->srlx(store_addr, CardTableModRefBS::card_shift, store_addr); + #else + post_filter_masm->srl(store_addr, CardTableModRefBS::card_shift, store_addr); + #endif + assert( tmp != store_addr, "need separate temp reg"); + Address rs(tmp, (address)bs->byte_map_base); + load_address(rs); + stb(G0, rs.base(), store_addr); + } + + bind(filtered); + + } + + #endif // SERIALGC + /////////////////////////////////////////////////////////////////////////////////// + + void MacroAssembler::card_write_barrier_post(Register store_addr, Register new_val, Register tmp) { + // If we're writing constant NULL, we can skip the write barrier. + if (new_val == G0) return; + CardTableModRefBS* bs = (CardTableModRefBS*) Universe::heap()->barrier_set(); + assert(bs->kind() == BarrierSet::CardTableModRef || + bs->kind() == BarrierSet::CardTableExtension, "wrong barrier"); + card_table_write(bs->byte_map_base, tmp, store_addr); + } + + void MacroAssembler::load_klass(Register src_oop, Register klass) { + // The number of bytes in this code is used by + // MachCallDynamicJavaNode::ret_addr_offset() + // if this changes, change that. + if (UseCompressedOops) { + lduw(src_oop, oopDesc::klass_offset_in_bytes(), klass); + decode_heap_oop_not_null(klass); + } else { + ld_ptr(src_oop, oopDesc::klass_offset_in_bytes(), klass); + } + } + + void MacroAssembler::store_klass(Register klass, Register dst_oop) { + if (UseCompressedOops) { + assert(dst_oop != klass, "not enough registers"); + encode_heap_oop_not_null(klass); + st(klass, dst_oop, oopDesc::klass_offset_in_bytes()); + } else { + st_ptr(klass, dst_oop, oopDesc::klass_offset_in_bytes()); + } + } + + void MacroAssembler::store_klass_gap(Register s, Register d) { + if (UseCompressedOops) { + assert(s != d, "not enough registers"); + st(s, d, oopDesc::klass_gap_offset_in_bytes()); + } + } + + void MacroAssembler::load_heap_oop(const Address& s, Register d, int offset) { + if (UseCompressedOops) { + lduw(s, d, offset); + decode_heap_oop(d); + } else { + ld_ptr(s, d, offset); + } + } + + void MacroAssembler::load_heap_oop(Register s1, Register s2, Register d) { + if (UseCompressedOops) { + lduw(s1, s2, d); + decode_heap_oop(d, d); + } else { + ld_ptr(s1, s2, d); + } + } + + void MacroAssembler::load_heap_oop(Register s1, int simm13a, Register d) { + if (UseCompressedOops) { + lduw(s1, simm13a, d); + decode_heap_oop(d, d); + } else { + ld_ptr(s1, simm13a, d); + } + } + + void MacroAssembler::store_heap_oop(Register d, Register s1, Register s2) { + if (UseCompressedOops) { + assert(s1 != d && s2 != d, "not enough registers"); + encode_heap_oop(d); + st(d, s1, s2); + } else { + st_ptr(d, s1, s2); + } + } + + void MacroAssembler::store_heap_oop(Register d, Register s1, int simm13a) { + if (UseCompressedOops) { + assert(s1 != d, "not enough registers"); + encode_heap_oop(d); + st(d, s1, simm13a); + } else { + st_ptr(d, s1, simm13a); + } + } + + void MacroAssembler::store_heap_oop(Register d, const Address& a, int offset) { + if (UseCompressedOops) { + assert(a.base() != d, "not enough registers"); + encode_heap_oop(d); + st(d, a, offset); + } else { + st_ptr(d, a, offset); + } + } + + + void 
MacroAssembler::encode_heap_oop(Register src, Register dst) { + assert (UseCompressedOops, "must be compressed"); + verify_oop(src); + Label done; + if (src == dst) { + // optimize for frequent case src == dst + bpr(rc_nz, true, Assembler::pt, src, done); + delayed() -> sub(src, G6_heapbase, dst); // annuled if not taken + bind(done); + srlx(src, LogMinObjAlignmentInBytes, dst); + } else { + bpr(rc_z, false, Assembler::pn, src, done); + delayed() -> mov(G0, dst); + // could be moved before branch, and annulate delay, + // but may add some unneeded work decoding null + sub(src, G6_heapbase, dst); + srlx(dst, LogMinObjAlignmentInBytes, dst); + bind(done); + } + } + + + void MacroAssembler::encode_heap_oop_not_null(Register r) { + assert (UseCompressedOops, "must be compressed"); + verify_oop(r); + sub(r, G6_heapbase, r); + srlx(r, LogMinObjAlignmentInBytes, r); + } + + void MacroAssembler::encode_heap_oop_not_null(Register src, Register dst) { + assert (UseCompressedOops, "must be compressed"); + verify_oop(src); + sub(src, G6_heapbase, dst); + srlx(dst, LogMinObjAlignmentInBytes, dst); + } + + // Same algorithm as oops.inline.hpp decode_heap_oop. + void MacroAssembler::decode_heap_oop(Register src, Register dst) { + assert (UseCompressedOops, "must be compressed"); + Label done; + sllx(src, LogMinObjAlignmentInBytes, dst); + bpr(rc_nz, true, Assembler::pt, dst, done); + delayed() -> add(dst, G6_heapbase, dst); // annuled if not taken + bind(done); + verify_oop(dst); + } + + void MacroAssembler::decode_heap_oop_not_null(Register r) { + // Do not add assert code to this unless you change vtableStubs_sparc.cpp + // pd_code_size_limit. + // Also do not verify_oop as this is called by verify_oop. + assert (UseCompressedOops, "must be compressed"); + sllx(r, LogMinObjAlignmentInBytes, r); + add(r, G6_heapbase, r); + } + + void MacroAssembler::decode_heap_oop_not_null(Register src, Register dst) { + // Do not add assert code to this unless you change vtableStubs_sparc.cpp + // pd_code_size_limit. + // Also do not verify_oop as this is called by verify_oop. + assert (UseCompressedOops, "must be compressed"); + sllx(src, LogMinObjAlignmentInBytes, dst); + add(dst, G6_heapbase, dst); + } + + void MacroAssembler::reinit_heapbase() { + if (UseCompressedOops) { + // call indirectly to solve generation ordering problem + Address base(G6_heapbase, (address)Universe::heap_base_addr()); + load_ptr_contents(base, G6_heapbase); + } + }
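[Editor's note, not part of the webrev: the encode/decode sequences above implement the following arithmetic. This is a sketch, assuming the scheme used in this change: narrow oop 0 represents NULL, objects are 8-byte aligned (LogMinObjAlignmentInBytes == 3), and G6_heapbase holds the heap base.]

  #include <stdint.h>

  static const int LogMinObjAlignmentInBytes = 3;   // 8-byte object alignment (assumed)

  static uint32_t encode_heap_oop_sketch(uintptr_t heap_base, uintptr_t oop) {
    if (oop == 0) return 0;                                       // NULL encodes to 0
    return (uint32_t)((oop - heap_base) >> LogMinObjAlignmentInBytes);
  }

  static uintptr_t decode_heap_oop_sketch(uintptr_t heap_base, uint32_t narrow_oop) {
    if (narrow_oop == 0) return 0;                                // 0 decodes to NULL
    return heap_base + ((uintptr_t)narrow_oop << LogMinObjAlignmentInBytes);
  }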