--- old/src/cpu/aarch64/vm/c1_CodeStubs_aarch64.cpp 2015-08-12 14:08:48.000000000 +0200 +++ new/src/cpu/aarch64/vm/c1_CodeStubs_aarch64.cpp 2015-08-12 14:08:48.000000000 +0200 @@ -347,6 +347,9 @@ __ b(_continuation); } +void C1ThreadLocalSafepoint::emit_code(LIR_Assembler* ce) { + ShouldNotReachHere(); +} ///////////////////////////////////////////////////////////////////////////// #if INCLUDE_ALL_GCS --- old/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp 2015-08-12 14:08:49.000000000 +0200 +++ new/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp 2015-08-12 14:08:49.000000000 +0200 @@ -486,7 +486,7 @@ // This is the fast version of java.lang.String.compare; it has not // OSR-entry and therefore, we generate a slow version for OSR's -void LIR_Assembler::emit_string_compare(LIR_Opr arg0, LIR_Opr arg1, LIR_Opr dst, CodeEmitInfo* info) { +void LIR_Assembler::emit_string_compare(LIR_Opr arg0, LIR_Opr arg1, LIR_Opr dst, C1ThreadLocalSafepoint *tls_stub, CodeEmitInfo* info) { __ mov(r2, (address)__FUNCTION__); __ call_Unimplemented(); } @@ -538,7 +538,7 @@ __ bind(nope); } -void LIR_Assembler::return_op(LIR_Opr result) { +void LIR_Assembler::return_op(LIR_Opr result, C1ThreadLocalSafepoint *tls_stub) { assert(result->is_illegal() || !result->is_single_cpu() || result->as_register() == r0, "word returns are in r0,"); // Pop the stack before the safepoint code __ remove_frame(initial_frame_size_in_bytes()); @@ -547,7 +547,7 @@ __ ret(lr); } -int LIR_Assembler::safepoint_poll(LIR_Opr tmp, CodeEmitInfo* info) { +int LIR_Assembler::safepoint_poll(LIR_Opr tmp, C1ThreadLocalSafepoint *tls_stub, CodeEmitInfo* info) { address polling_page(os::get_polling_page()); guarantee(info != NULL, "Shouldn't be NULL"); assert(os::is_poll_address(polling_page), "should be"); --- old/src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp 2015-08-12 14:08:51.000000000 +0200 +++ new/src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp 2015-08-12 14:08:50.000000000 +0200 @@ -452,6 +452,9 @@ __ delayed()->nop(); } +void C1ThreadLocalSafepoint::emit_code(LIR_Assembler* ce) { + ShouldNotReachHere(); +} /////////////////////////////////////////////////////////////////////////////////// #if INCLUDE_ALL_GCS --- old/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp 2015-08-12 14:08:52.000000000 +0200 +++ new/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp 2015-08-12 14:08:52.000000000 +0200 @@ -235,7 +235,7 @@ // Optimized Library calls // This is the fast version of java.lang.String.compare; it has not // OSR-entry and therefore, we generate a slow version for OSR's -void LIR_Assembler::emit_string_compare(LIR_Opr left, LIR_Opr right, LIR_Opr dst, CodeEmitInfo* info) { +void LIR_Assembler::emit_string_compare(LIR_Opr left, LIR_Opr right, LIR_Opr dst, C1ThreadLocalSafepoint *tls_stub, CodeEmitInfo* info) { Register str0 = left->as_register(); Register str1 = right->as_register(); @@ -1564,7 +1564,7 @@ } -void LIR_Assembler::return_op(LIR_Opr result) { +void LIR_Assembler::return_op(LIR_Opr result, C1ThreadLocalSafepoint *tls_stub) { // the poll may need a register so just pick one that isn't the return register #if defined(TIERED) && !defined(_LP64) if (result->type_field() == LIR_OprDesc::long_type) { @@ -1588,7 +1588,7 @@ } -int LIR_Assembler::safepoint_poll(LIR_Opr tmp, CodeEmitInfo* info) { +int LIR_Assembler::safepoint_poll(LIR_Opr tmp, C1ThreadLocalSafepoint *tls_stub, CodeEmitInfo* info) { __ set((intptr_t)os::get_polling_page(), tmp->as_register()); if (info != NULL) { add_debug_info_for_branch(info); --- old/src/cpu/x86/vm/assembler_x86.cpp 2015-08-12 
14:08:53.000000000 +0200 +++ new/src/cpu/x86/vm/assembler_x86.cpp 2015-08-12 14:08:53.000000000 +0200 @@ -3498,6 +3498,14 @@ emit_arith_b(0xF6, 0xC0, dst, imm8); } +void Assembler::testb(Address dst, int8_t imm8) { + InstructionMark im(this); + prefix(dst); + emit_int8((unsigned char)0xF6); + emit_operand(rax, dst, 1); + emit_int8((unsigned char)imm8); +} + void Assembler::testl(Register dst, int32_t imm32) { // not using emit_arith because test // doesn't support sign-extension of --- old/src/cpu/x86/vm/assembler_x86.hpp 2015-08-12 14:08:55.000000000 +0200 +++ new/src/cpu/x86/vm/assembler_x86.hpp 2015-08-12 14:08:54.000000000 +0200 @@ -1818,6 +1818,7 @@ void subss(XMMRegister dst, XMMRegister src); void testb(Register dst, int imm8); + void testb(Address dst, int8_t imm8); void testl(Register dst, int32_t imm32); void testl(Register dst, Register src); --- old/src/cpu/x86/vm/c1_CodeStubs_x86.cpp 2015-08-12 14:08:56.000000000 +0200 +++ new/src/cpu/x86/vm/c1_CodeStubs_x86.cpp 2015-08-12 14:08:56.000000000 +0200 @@ -518,6 +518,33 @@ __ jmp(_continuation); } +void C1ThreadLocalSafepoint::emit_code(LIR_Assembler* ce) { +#ifdef _LP64 + __ bind(_entry); + InternalAddress pc_addr(safepoint_pc()); + __ lea(rscratch1, pc_addr); + __ movptr(Address(r15_thread, JavaThread::saved_exception_pc_offset()), rscratch1); + + address stub; + + if (is_return()) { + assert(SharedRuntime::polling_page_return_handler_blob() != NULL, + "polling page return stub not created yet"); + stub = SharedRuntime::polling_page_return_handler_blob()->entry_point(); + } else { + assert(SharedRuntime::polling_page_safepoint_handler_blob() != NULL, + "polling page safepoint stub not created yet"); + stub = SharedRuntime::polling_page_safepoint_handler_blob()->entry_point(); + } + + RuntimeAddress callback_addr(stub); + + __ jump(callback_addr); +#else + ShouldNotReachHere(); +#endif /* _LP64 */ +} + ///////////////////////////////////////////////////////////////////////////// #if INCLUDE_ALL_GCS --- old/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp 2015-08-12 14:08:57.000000000 +0200 +++ new/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp 2015-08-12 14:08:57.000000000 +0200 @@ -26,6 +26,7 @@ #include "asm/macroAssembler.hpp" #include "asm/macroAssembler.inline.hpp" #include "c1/c1_Compilation.hpp" +#include "c1/c1_CodeStubs.hpp" #include "c1/c1_LIRAssembler.hpp" #include "c1/c1_MacroAssembler.hpp" #include "c1/c1_Runtime1.hpp" @@ -513,7 +514,7 @@ // This is the fast version of java.lang.String.compare; it has not // OSR-entry and therefore, we generate a slow version for OSR's -void LIR_Assembler::emit_string_compare(LIR_Opr arg0, LIR_Opr arg1, LIR_Opr dst, CodeEmitInfo* info) { +void LIR_Assembler::emit_string_compare(LIR_Opr arg0, LIR_Opr arg1, LIR_Opr dst, C1ThreadLocalSafepoint *tls_stub, CodeEmitInfo* info) { __ movptr (rbx, rcx); // receiver is in rcx __ movptr (rax, arg1->as_register()); @@ -583,7 +584,7 @@ __ bind(noLoop); __ pop(rax); - return_op(LIR_OprFact::illegalOpr); + return_op(LIR_OprFact::illegalOpr, tls_stub); __ bind(haveResult); // leave instruction is going to discard the TOS value @@ -591,7 +592,7 @@ } -void LIR_Assembler::return_op(LIR_Opr result) { +void LIR_Assembler::return_op(LIR_Opr result, C1ThreadLocalSafepoint *code_stub) { assert(result->is_illegal() || !result->is_single_cpu() || result->as_register() == rax, "word returns are in rax,"); if (!result->is_illegal() && result->is_float_kind() && !result->is_xmm_register()) { assert(result->fpu() == 0, "result must already be on TOS"); @@ -604,33 +605,58 @@ // 
Note: we do not need to round double result; float result has the right precision // the poll sets the condition code, but no data registers - AddressLiteral polling_page(os::get_polling_page(), relocInfo::poll_return_type); - if (Assembler::is_polling_page_far()) { - __ lea(rscratch1, polling_page); - __ relocate(relocInfo::poll_return_type); - __ testl(rax, Address(rscratch1, 0)); + if (!ThreadLocalSafepoints) { + AddressLiteral polling_page(os::get_polling_page(), relocInfo::poll_return_type); + + if (Assembler::is_polling_page_far()) { + __ lea(rscratch1, polling_page); + __ relocate(relocInfo::poll_return_type); + __ testl(rax, Address(rscratch1, 0)); + } else { + __ testl(rax, polling_page); + } } else { - __ testl(rax, polling_page); +#ifdef _LP64 + code_stub->set_safepoint_pc(__ pc()); + __ relocate(relocInfo::poll_return_type); + __ testb(Address(r15_thread, Thread::yieldpoint_offset()), 2); + __ jcc(Assembler::equal, *code_stub->entry()); +#else + ShouldNotReachHere(); +#endif } __ ret(0); } -int LIR_Assembler::safepoint_poll(LIR_Opr tmp, CodeEmitInfo* info) { +int LIR_Assembler::safepoint_poll(LIR_Opr tmp, C1ThreadLocalSafepoint *code_stub, CodeEmitInfo* info) { AddressLiteral polling_page(os::get_polling_page(), relocInfo::poll_type); guarantee(info != NULL, "Shouldn't be NULL"); int offset = __ offset(); - if (Assembler::is_polling_page_far()) { - __ lea(rscratch1, polling_page); - offset = __ offset(); - add_debug_info_for_branch(info); - __ relocate(relocInfo::poll_type); - __ testl(rax, Address(rscratch1, 0)); + if (!ThreadLocalSafepoints) { + if (Assembler::is_polling_page_far()) { + __ lea(rscratch1, polling_page); + offset = __ offset(); + add_debug_info_for_branch(info); + __ relocate(relocInfo::poll_type); + __ testl(rax, Address(rscratch1, 0)); + } else { + add_debug_info_for_branch(info); + __ testl(rax, polling_page); + } } else { +#ifdef _LP64 add_debug_info_for_branch(info); - __ testl(rax, polling_page); + code_stub->set_safepoint_pc(__ pc()); + __ relocate(relocInfo::poll_type); + __ testb(Address(r15_thread, Thread::yieldpoint_offset()), 1); + __ jcc(Assembler::equal, *code_stub->entry()); +#else + ShouldNotReachHere(); +#endif } + return offset; } --- old/src/cpu/x86/vm/c1_Runtime1_x86.cpp 2015-08-12 14:08:58.000000000 +0200 +++ new/src/cpu/x86/vm/c1_Runtime1_x86.cpp 2015-08-12 14:08:58.000000000 +0200 @@ -1774,12 +1774,17 @@ NOT_LP64(__ get_thread(thread);) - __ cmpb(Address(card_addr, 0), (int)G1SATBCardTableModRefBS::g1_young_card_val()); - __ jcc(Assembler::equal, done); - - __ membar(Assembler::Membar_mask_bits(Assembler::StoreLoad)); - __ cmpb(Address(card_addr, 0), (int)CardTableModRefBS::dirty_card_val()); - __ jcc(Assembler::equal, done); + if (G1ElideMembar) { + __ cmpb(Address(card_addr, 0), (int)CardTableModRefBS::clean_card_val()); + __ jcc(Assembler::notEqual, done); + } else { + __ cmpb(Address(card_addr, 0), (int)G1SATBCardTableModRefBS::g1_young_card_val()); + __ jcc(Assembler::equal, done); + + __ membar(Assembler::Membar_mask_bits(Assembler::StoreLoad)); + __ cmpb(Address(card_addr, 0), (int)CardTableModRefBS::dirty_card_val()); + __ jcc(Assembler::equal, done); + } // storing region crossing non-NULL, card is clean. // dirty card and log. 
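For orientation, the C1 changes above reduce the safepoint fast path to a single thread-local test: with ThreadLocalSafepoints enabled, return_op() and safepoint_poll() no longer read the shared polling page but test a flag byte in the current JavaThread (reached via r15) and branch to the out-of-line C1ThreadLocalSafepoint stub, which records the polling pc and jumps to the matching SharedRuntime handler blob. The standalone C++ sketch below shows only that shape; ToyThread, yieldpoint_flags, PollKind and safepoint_slow_path are invented names, and the patch's actual bit values and branch sense (the emitted jcc(Assembler::equal, ...)) are its own convention rather than something this sketch asserts.

    // Illustrative sketch only -- not HotSpot code.
    #include <cstdint>

    struct ToyThread {                      // hypothetical stand-in for JavaThread
      volatile uint8_t yieldpoint_flags;    // stand-in for the field at Thread::yieldpoint_offset()
    };

    enum PollKind : uint8_t { POLL_IN_LOOP = 1, POLL_AT_RETURN = 2 };  // mirrors the 1/2 immediates

    void safepoint_slow_path(ToyThread*) {
      // Stand-in for the C1ThreadLocalSafepoint stub: the real stub stores the polling pc
      // into saved_exception_pc and tail-calls the polling-page (return) handler blob.
    }

    inline void poll(ToyThread* self, PollKind kind) {
      // Compiled code emits roughly: testb [r15 + yieldpoint_offset], kind ; jcc <stub>
      if (self->yieldpoint_flags & kind) {  // "flag set" means armed in this sketch
        safepoint_slow_path(self);
      }
    }

    int main() {
      ToyThread t = { 0 };
      t.yieldpoint_flags = POLL_IN_LOOP | POLL_AT_RETURN;  // roughly what arming a thread amounts to
      poll(&t, POLL_AT_RETURN);                            // armed, so the slow path is taken
      return 0;
    }

Because the poll reads thread-local state, arming can be done per thread (see the SafepointSynchronize::begin() change near the end of this patch), and the stub's is_return flag selects the return handler blob rather than the ordinary safepoint handler blob.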
--- old/src/cpu/x86/vm/macroAssembler_x86.cpp 2015-08-12 14:08:59.000000000 +0200 +++ new/src/cpu/x86/vm/macroAssembler_x86.cpp 2015-08-12 14:08:59.000000000 +0200 @@ -4223,13 +4223,17 @@ movptr(cardtable, (intptr_t)ct->byte_map_base); addptr(card_addr, cardtable); - cmpb(Address(card_addr, 0), (int)G1SATBCardTableModRefBS::g1_young_card_val()); - jcc(Assembler::equal, done); - - membar(Assembler::Membar_mask_bits(Assembler::StoreLoad)); - cmpb(Address(card_addr, 0), (int)CardTableModRefBS::dirty_card_val()); - jcc(Assembler::equal, done); + if (G1ElideMembar) { + cmpb(Address(card_addr, 0), (int)CardTableModRefBS::clean_card_val()); + jcc(Assembler::notEqual, done); + } else { + cmpb(Address(card_addr, 0), (int)G1SATBCardTableModRefBS::g1_young_card_val()); + jcc(Assembler::equal, done); + membar(Assembler::Membar_mask_bits(Assembler::StoreLoad)); + cmpb(Address(card_addr, 0), (int)CardTableModRefBS::dirty_card_val()); + jcc(Assembler::equal, done); + } // storing a region crossing, non-NULL oop, card is clean. // dirty card and log. --- old/src/cpu/x86/vm/nativeInst_x86.hpp 2015-08-12 14:09:01.000000000 +0200 +++ new/src/cpu/x86/vm/nativeInst_x86.hpp 2015-08-12 14:09:01.000000000 +0200 @@ -526,7 +526,11 @@ inline bool NativeInstruction::is_cond_jump() { return (int_at(0) & 0xF0FF) == 0x800F /* long jump */ || (ubyte_at(0) & 0xF0) == 0x70; /* short jump */ } inline bool NativeInstruction::is_safepoint_poll() { + // TODO: Fix up parsing of safepoint poll code. Skipping now as it doesn't seem to be used for much other than asserts. #ifdef AMD64 + if (ThreadLocalSafepoints) { + return true; + } if (Assembler::is_polling_page_far()) { // two cases, depending on the choice of the base register in the address. if (((ubyte_at(0) & NativeTstRegMem::instruction_rex_prefix_mask) == NativeTstRegMem::instruction_rex_prefix && --- old/src/cpu/x86/vm/relocInfo_x86.cpp 2015-08-12 14:09:02.000000000 +0200 +++ new/src/cpu/x86/vm/relocInfo_x86.cpp 2015-08-12 14:09:02.000000000 +0200 @@ -180,7 +180,7 @@ void poll_Relocation::fix_relocation_after_move(const CodeBuffer* src, CodeBuffer* dest) { #ifdef _LP64 - if (!Assembler::is_polling_page_far()) { + if (!ThreadLocalSafepoints && !Assembler::is_polling_page_far()) { typedef Assembler::WhichOperand WhichOperand; WhichOperand which = (WhichOperand) format(); // This format is imm but it is really disp32 @@ -202,7 +202,7 @@ void poll_return_Relocation::fix_relocation_after_move(const CodeBuffer* src, CodeBuffer* dest) { #ifdef _LP64 - if (!Assembler::is_polling_page_far()) { + if (!ThreadLocalSafepoints && !Assembler::is_polling_page_far()) { typedef Assembler::WhichOperand WhichOperand; WhichOperand which = (WhichOperand) format(); // This format is imm but it is really disp32 --- old/src/cpu/x86/vm/x86_64.ad 2015-08-12 14:09:03.000000000 +0200 +++ new/src/cpu/x86/vm/x86_64.ad 2015-08-12 14:09:03.000000000 +0200 @@ -938,7 +938,11 @@ st->print_cr("popq rbp"); if (do_polling() && C->is_method_compilation()) { st->print("\t"); - if (Assembler::is_polling_page_far()) { + if (ThreadLocalSafepoints) { + st->print_cr("testb $1, [r15]\t" + "# Safepoint: poll for GC\n\t"); + st->print_cr("je #slow_safepoint_runtime"); + } else if (Assembler::is_polling_page_far()) { st->print_cr("movq rscratch1, #polling_page_address\n\t" "testl rax, [rscratch1]\t" "# Safepoint: poll for GC"); @@ -987,7 +991,14 @@ if (do_polling() && C->is_method_compilation()) { MacroAssembler _masm(&cbuf); AddressLiteral polling_page(os::get_polling_page(), relocInfo::poll_return_type); - if 
(Assembler::is_polling_page_far()) { + if (ThreadLocalSafepoints) { + InternalAddress safepoint_pc(__ pc()); + Label dummy_label; + Label &code_stub = &cbuf == C->code_buffer() ? C->tls_table()->add_safepoint(safepoint_pc, true) : dummy_label; + __ relocate(relocInfo::poll_return_type); + __ testb(Address(r15_thread, Thread::yieldpoint_offset()), 2); + __ jcc(Assembler::equal, code_stub); + } else if (Assembler::is_polling_page_far()) { + __ lea(rscratch1, polling_page); + __ relocate(relocInfo::poll_return_type); + __ testl(rax, Address(rscratch1, 0)); @@ -1005,7 +1016,7 @@ int MachEpilogNode::reloc() const { - return 2; // a large enough number + return 3; // a large enough number } const Pipeline* MachEpilogNode::pipeline() const @@ -11552,7 +11563,7 @@ // Safepoint Instructions instruct safePoint_poll(rFlagsReg cr) %{ - predicate(!Assembler::is_polling_page_far()); + predicate(!Assembler::is_polling_page_far() || ThreadLocalSafepoints); match(SafePoint); effect(KILL cr); @@ -11560,15 +11571,25 @@ "# Safepoint: poll for GC" %} ins_cost(125); ins_encode %{ - AddressLiteral addr(os::get_polling_page(), relocInfo::poll_type); - __ testl(rax, addr); + if (ThreadLocalSafepoints) { + Compile* C = ra_->C; + InternalAddress safepoint_pc(__ pc()); + Label dummy_label; + Label &code_stub = &cbuf == C->code_buffer() ? C->tls_table()->add_safepoint(safepoint_pc, false) : dummy_label; + __ relocate(relocInfo::poll_type); + __ testb(Address(r15_thread, Thread::yieldpoint_offset()), 1); + __ jcc(Assembler::equal, code_stub); + } else { + AddressLiteral addr(os::get_polling_page(), relocInfo::poll_type); + __ testl(rax, addr); + } %} ins_pipe(ialu_reg_mem); %} instruct safePoint_poll_far(rFlagsReg cr, rRegP poll) %{ - predicate(Assembler::is_polling_page_far()); + predicate(Assembler::is_polling_page_far() && !ThreadLocalSafepoints); match(SafePoint poll); effect(KILL cr, USE poll); --- old/src/os/aix/vm/osThread_aix.cpp 2015-08-12 14:09:05.000000000 +0200 +++ new/src/os/aix/vm/osThread_aix.cpp 2015-08-12 14:09:05.000000000 +0200 @@ -53,3 +53,7 @@ void OSThread::pd_destroy() { delete _startThread_lock; } + +bool OSThread::is_online() { + return true; +} --- old/src/os/bsd/vm/osThread_bsd.cpp 2015-08-12 14:09:06.000000000 +0200 +++ new/src/os/bsd/vm/osThread_bsd.cpp 2015-08-12 14:09:06.000000000 +0200 @@ -51,3 +51,28 @@ void OSThread::pd_destroy() { delete _startThread_lock; } + +bool OSThread::is_online() { +#ifdef __APPLE__ + mach_msg_type_number_t thread_info_count = THREAD_BASIC_INFO_COUNT; + thread_basic_info_data_t thread_info_data; + + kern_return_t kr = thread_info( + _thread_id, + THREAD_BASIC_INFO, + reinterpret_cast<thread_info_t>(&thread_info_data), + &thread_info_count); + if (kr != KERN_SUCCESS) { + return false; + } + + if (thread_info_data.run_state != TH_STATE_RUNNING) return false; + + const bool swapped = (thread_info_data.flags & TH_FLAGS_SWAPPED); + const bool idle = (thread_info_data.flags & TH_FLAGS_IDLE); + + return !swapped && !idle; +#else + return true; +#endif +} --- old/src/os/linux/vm/osThread_linux.cpp 2015-08-12 14:09:07.000000000 +0200 +++ new/src/os/linux/vm/osThread_linux.cpp 2015-08-12 14:09:07.000000000 +0200 @@ -47,3 +47,7 @@ void OSThread::pd_destroy() { delete _startThread_lock; } + +bool OSThread::is_online() { + return true; +} --- old/src/os/solaris/vm/osThread_solaris.cpp 2015-08-12 14:09:09.000000000 +0200 +++ new/src/os/solaris/vm/osThread_solaris.cpp 2015-08-12 14:09:08.000000000 +0200 @@ -51,3 +51,8 @@ void OSThread::SR_handler(Thread* thread, ucontext_t* uc) {
os::Solaris::SR_handler(thread, uc); } + +bool OSThread::is_online() { + // TODO: Solaris can do better: find out if a thread is ONPROC. + return true; +} --- old/src/os/windows/vm/osThread_windows.cpp 2015-08-12 14:09:10.000000000 +0200 +++ new/src/os/windows/vm/osThread_windows.cpp 2015-08-12 14:09:10.000000000 +0200 @@ -41,3 +41,7 @@ // free_thread. Should follow pattern of Linux/Solaris code here. void OSThread::pd_destroy() { } + +bool OSThread::is_online() { + return true; +} --- old/src/share/vm/c1/c1_CodeStubs.hpp 2015-08-12 14:09:11.000000000 +0200 +++ new/src/share/vm/c1/c1_CodeStubs.hpp 2015-08-12 14:09:11.000000000 +0200 @@ -535,6 +535,29 @@ #endif // PRODUCT }; +class C1ThreadLocalSafepoint: public CodeStub { + private: + address _safepoint_pc; + bool _is_return; + + public: + C1ThreadLocalSafepoint(bool is_return) : _is_return(is_return) { } + + bool is_return() { return _is_return; } + + address safepoint_pc() { return _safepoint_pc; } + void set_safepoint_pc(address pc) { _safepoint_pc = pc; } + + virtual void emit_code(LIR_Assembler* e); + virtual void visit(LIR_OpVisitState* visitor) { + // don't pass in the code emit info since it's processed in the fast path + visitor->do_slow_case(); + } +#ifndef PRODUCT + virtual void print_name(outputStream* out) const { out->print("C1ThreadLocalSafepoint"); } +#endif // PRODUCT +}; + ////////////////////////////////////////////////////////////////////////////////////////// #if INCLUDE_ALL_GCS --- old/src/share/vm/c1/c1_LIR.cpp 2015-08-12 14:09:12.000000000 +0200 +++ new/src/share/vm/c1/c1_LIR.cpp 2015-08-12 14:09:12.000000000 +0200 @@ -518,7 +518,6 @@ case lir_ffree: // input always valid, result and info always invalid case lir_push: // input always valid, result and info always invalid case lir_pop: // input always valid, result and info always invalid - case lir_return: // input always valid, result and info always invalid case lir_leal: // input and result always valid, info always invalid case lir_neg: // input and result always valid, info always invalid case lir_monaddr: // input and result always valid, info always invalid @@ -537,15 +536,31 @@ break; } + case lir_return: // input always valid, result and info always invalid + { + assert(op->as_Op1Safepoint() != NULL, "must be"); + LIR_Op1Safepoint* op_ret = (LIR_Op1Safepoint*)op; + + if (op_ret->_info) do_info(op_ret->_info); + if (op_ret->_opr->is_valid()) do_input(op_ret->_opr); + if (op_ret->_result->is_valid()) do_output(op_ret->_result); + + if (op_ret->tls_stub() != NULL) do_stub(op_ret->tls_stub()); + + break; + } + case lir_safepoint: { - assert(op->as_Op1() != NULL, "must be"); - LIR_Op1* op1 = (LIR_Op1*)op; + assert(op->as_Op1Safepoint() != NULL, "must be"); + LIR_Op1Safepoint* op1 = (LIR_Op1Safepoint*)op; assert(op1->_info != NULL, ""); do_info(op1->_info); if (op1->_opr->is_valid()) do_temp(op1->_opr); // safepoints on SPARC need temporary register assert(op1->_result->is_illegal(), "safepoint does not produce value"); + if (op1->tls_stub() != NULL) do_stub(op1->tls_stub()); + break; } @@ -1527,6 +1542,14 @@ append(new LIR_OpCompareAndSwap(lir_cas_int, addr, cmp_value, new_value, t1, t2, result)); } +// LIR_Op1Safepoint +LIR_Op1Safepoint::LIR_Op1Safepoint(LIR_Code code, LIR_Opr opr, CodeEmitInfo* info) + : LIR_Op1(code, opr, info) + , _tls_stub(NULL) { + if (ThreadLocalSafepoints) { + _tls_stub = new C1ThreadLocalSafepoint(code == lir_return); + } +} #ifdef PRODUCT --- old/src/share/vm/c1/c1_LIR.hpp 2015-08-12 14:09:13.000000000 +0200 +++ 
new/src/share/vm/c1/c1_LIR.hpp 2015-08-12 14:09:13.000000000 +0200 @@ -36,6 +36,7 @@ class CodeStub; class CodeStubList; class ArrayCopyStub; +class C1ThreadLocalSafepoint; class LIR_Op; class ciType; class ValueType; @@ -873,6 +874,7 @@ class LIR_OpLabel; class LIR_Op1; class LIR_OpBranch; +class LIR_Op1Safepoint; class LIR_OpConvert; class LIR_OpAllocObj; class LIR_OpRoundFP; @@ -1142,6 +1144,7 @@ virtual LIR_OpAllocObj* as_OpAllocObj() { return NULL; } virtual LIR_OpRoundFP* as_OpRoundFP() { return NULL; } virtual LIR_OpBranch* as_OpBranch() { return NULL; } + virtual LIR_Op1Safepoint* as_Op1Safepoint() { return NULL; } virtual LIR_OpRTCall* as_OpRTCall() { return NULL; } virtual LIR_OpConvert* as_OpConvert() { return NULL; } virtual LIR_Op0* as_Op0() { return NULL; } @@ -1468,6 +1471,19 @@ virtual void print_instr(outputStream* out) const PRODUCT_RETURN; }; +class LIR_Op1Safepoint: public LIR_Op1 { + friend class LIR_OpVisitState; + + private: + C1ThreadLocalSafepoint* _tls_stub; + + public: + LIR_Op1Safepoint(LIR_Code code, LIR_Opr opr, CodeEmitInfo* info); + + C1ThreadLocalSafepoint* tls_stub() const { return _tls_stub; } + + virtual LIR_Op1Safepoint* as_Op1Safepoint() { return this; } +}; class ConversionStub; @@ -2140,9 +2156,9 @@ void metadata2reg (Metadata* o, LIR_Opr reg) { assert(reg->type() == T_METADATA, "bad reg"); append(new LIR_Op1(lir_move, LIR_OprFact::metadataConst(o), reg)); } void klass2reg_patch(Metadata* o, LIR_Opr reg, CodeEmitInfo* info); - void return_op(LIR_Opr result) { append(new LIR_Op1(lir_return, result)); } + void return_op(LIR_Opr result) { append(new LIR_Op1Safepoint(lir_return, result, NULL)); } - void safepoint(LIR_Opr tmp, CodeEmitInfo* info) { append(new LIR_Op1(lir_safepoint, tmp, info)); } + void safepoint(LIR_Opr tmp, CodeEmitInfo* info) { append(new LIR_Op1Safepoint(lir_safepoint, tmp, info)); } #ifdef PPC void convert(Bytecodes::Code code, LIR_Opr left, LIR_Opr dst, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_OpConvert(code, left, dst, NULL, tmp1, tmp2)); } --- old/src/share/vm/c1/c1_LIRAssembler.cpp 2015-08-12 14:09:15.000000000 +0200 +++ new/src/share/vm/c1/c1_LIRAssembler.cpp 2015-08-12 14:09:14.000000000 +0200 @@ -510,16 +510,24 @@ break; } - case lir_return: - return_op(op->in_opr()); + case lir_return: { + assert(op->as_Op1Safepoint() != NULL, "sanity"); + LIR_Op1Safepoint *ret_op = (LIR_Op1Safepoint*)op; + return_op(ret_op->in_opr(), ret_op->tls_stub()); + if (ret_op->tls_stub()) append_code_stub(ret_op->tls_stub()); break; + } - case lir_safepoint: + case lir_safepoint: { + assert(op->as_Op1Safepoint() != NULL, "sanity"); + LIR_Op1Safepoint *sp_op = (LIR_Op1Safepoint*)op; if (compilation()->debug_info_recorder()->last_pc_offset() == code_offset()) { _masm->nop(); } - safepoint_poll(op->in_opr(), op->info()); + safepoint_poll(op->in_opr(), sp_op->tls_stub(), op->info()); + if (sp_op->tls_stub()) append_code_stub(sp_op->tls_stub()); break; + } case lir_fxch: fxch(op->in_opr()->as_jint()); --- old/src/share/vm/c1/c1_LIRAssembler.hpp 2015-08-12 14:09:16.000000000 +0200 +++ new/src/share/vm/c1/c1_LIRAssembler.hpp 2015-08-12 14:09:16.000000000 +0200 @@ -161,12 +161,12 @@ // particular sparc uses this for delay slot filling. 
void peephole(LIR_List* list); - void emit_string_compare(LIR_Opr left, LIR_Opr right, LIR_Opr dst, CodeEmitInfo* info); + void emit_string_compare(LIR_Opr left, LIR_Opr right, LIR_Opr dst, C1ThreadLocalSafepoint *tls_stub, CodeEmitInfo* info); - void return_op(LIR_Opr result); + void return_op(LIR_Opr result, C1ThreadLocalSafepoint *code_stub); // returns offset of poll instruction - int safepoint_poll(LIR_Opr result, CodeEmitInfo* info); + int safepoint_poll(LIR_Opr result, C1ThreadLocalSafepoint *tls_stub, CodeEmitInfo* info); void const2reg (LIR_Opr src, LIR_Opr dest, LIR_PatchCode patch_code, CodeEmitInfo* info); void const2stack(LIR_Opr src, LIR_Opr dest); --- old/src/share/vm/c1/c1_LinearScan.cpp 2015-08-12 14:09:17.000000000 +0200 +++ new/src/share/vm/c1/c1_LinearScan.cpp 2015-08-12 14:09:17.000000000 +0200 @@ -6304,7 +6304,7 @@ if (pred_last_branch->block() == block && pred_last_branch->cond() == lir_cond_always && pred_last_branch->info() == NULL) { // replace the jump to a return with a direct return // Note: currently the edge between the blocks is not deleted - pred_instructions->at_put(pred_instructions->length() - 1, new LIR_Op1(lir_return, return_opr)); + pred_instructions->at_put(pred_instructions->length() - 1, new LIR_Op1Safepoint(lir_return, return_opr, NULL)); #ifdef ASSERT return_converted.set_bit(pred->block_id()); #endif --- old/src/share/vm/gc/g1/concurrentG1Refine.cpp 2015-08-12 14:09:18.000000000 +0200 +++ new/src/share/vm/gc/g1/concurrentG1Refine.cpp 2015-08-12 14:09:18.000000000 +0200 @@ -29,7 +29,7 @@ #include "gc/g1/g1HotCardCache.hpp" #include "runtime/java.hpp" -ConcurrentG1Refine::ConcurrentG1Refine(G1CollectedHeap* g1h, CardTableEntryClosure* refine_closure) : +ConcurrentG1Refine::ConcurrentG1Refine(G1CollectedHeap* g1h) : _threads(NULL), _n_threads(0), _hot_card_cache(g1h) { @@ -61,7 +61,7 @@ ConcurrentG1RefineThread *next = NULL; for (uint i = _n_threads - 1; i != UINT_MAX; i--) { - ConcurrentG1RefineThread* t = new ConcurrentG1RefineThread(this, next, refine_closure, worker_id_offset, i); + ConcurrentG1RefineThread* t = new ConcurrentG1RefineThread(this, next, worker_id_offset, i); assert(t != NULL, "Conc refine should have been created"); if (t->osthread() == NULL) { vm_shutdown_during_initialization("Could not create ConcurrentG1RefineThread"); --- old/src/share/vm/gc/g1/concurrentG1Refine.hpp 2015-08-12 14:09:19.000000000 +0200 +++ new/src/share/vm/gc/g1/concurrentG1Refine.hpp 2015-08-12 14:09:19.000000000 +0200 @@ -72,7 +72,7 @@ void reset_threshold_step(); public: - ConcurrentG1Refine(G1CollectedHeap* g1h, CardTableEntryClosure* refine_closure); + ConcurrentG1Refine(G1CollectedHeap* g1h); ~ConcurrentG1Refine(); void init(G1RegionToSpaceMapper* card_counts_storage); --- old/src/share/vm/gc/g1/concurrentG1RefineThread.cpp 2015-08-12 14:09:21.000000000 +0200 +++ new/src/share/vm/gc/g1/concurrentG1RefineThread.cpp 2015-08-12 14:09:20.000000000 +0200 @@ -34,10 +34,8 @@ ConcurrentG1RefineThread:: ConcurrentG1RefineThread(ConcurrentG1Refine* cg1r, ConcurrentG1RefineThread *next, - CardTableEntryClosure* refine_closure, uint worker_id_offset, uint worker_id) : ConcurrentGCThread(), - _refine_closure(refine_closure), _worker_id_offset(worker_id_offset), _worker_id(worker_id), _active(false), @@ -190,6 +188,7 @@ { SuspendibleThreadSetJoiner sts_join; + BufferedRefineCardTableEntryClosure cl; do { int curr_buffer_num = (int)dcqs.completed_buffers_num(); @@ -203,6 +202,7 @@ // If the number of the buffer has fallen below our threshold // we should deactivate. 
The predecessor will reactivate this // thread should the number of the buffers cross the threshold again. + cl.flush_buffer(); deactivate(); break; } @@ -211,7 +211,9 @@ if (_next != NULL && !_next->is_active() && curr_buffer_num > _next->_threshold) { _next->activate(); } - } while (dcqs.apply_closure_to_completed_buffer(_refine_closure, _worker_id + _worker_id_offset, cg1r()->green_zone())); + } while (dcqs.apply_closure_to_completed_buffer(&cl, _worker_id + _worker_id_offset, cg1r()->green_zone())); + + cl.flush_buffer(); // We can exit the loop above while being active if there was a yield request. if (is_active()) { @@ -251,4 +253,3 @@ gclog_or_tty->print_cr("G1-Refine-stop"); } } - --- old/src/share/vm/gc/g1/concurrentG1RefineThread.hpp 2015-08-12 14:09:22.000000000 +0200 +++ new/src/share/vm/gc/g1/concurrentG1RefineThread.hpp 2015-08-12 14:09:21.000000000 +0200 @@ -50,9 +50,6 @@ Monitor* _monitor; ConcurrentG1Refine* _cg1r; - // The closure applied to completed log buffers. - CardTableEntryClosure* _refine_closure; - int _thread_threshold_step; // This thread activation threshold int _threshold; @@ -72,7 +69,6 @@ virtual void run(); // Constructor ConcurrentG1RefineThread(ConcurrentG1Refine* cg1r, ConcurrentG1RefineThread* next, - CardTableEntryClosure* refine_closure, uint worker_id_offset, uint worker_id); void initialize(); --- old/src/share/vm/gc/g1/dirtyCardQueue.cpp 2015-08-12 14:09:23.000000000 +0200 +++ new/src/share/vm/gc/g1/dirtyCardQueue.cpp 2015-08-12 14:09:23.000000000 +0200 @@ -70,7 +70,6 @@ DirtyCardQueueSet::DirtyCardQueueSet(bool notify_when_complete) : PtrQueueSet(notify_when_complete), - _mut_process_closure(NULL), _shared_dirty_card_queue(this, true /*perm*/), _free_ids(NULL), _processed_buffers_mut(0), _processed_buffers_rs_thread(0) @@ -83,11 +82,11 @@ return (uint)os::processor_count(); } -void DirtyCardQueueSet::initialize(CardTableEntryClosure* cl, Monitor* cbl_mon, Mutex* fl_lock, +void DirtyCardQueueSet::initialize(bool should_do_processing, Monitor* cbl_mon, Mutex* fl_lock, int process_completed_threshold, int max_completed_queue, Mutex* lock, PtrQueueSet* fl_owner) { - _mut_process_closure = cl; + _should_do_processing = should_do_processing; PtrQueueSet::initialize(cbl_mon, fl_lock, process_completed_threshold, max_completed_queue, fl_owner); set_buffer_size(G1UpdateBufferSize); @@ -141,8 +140,10 @@ bool b = false; if (worker_i != UINT_MAX) { - b = DirtyCardQueue::apply_closure_to_buffer(_mut_process_closure, buf, 0, + BufferedRefineCardTableEntryClosure cl; + b = DirtyCardQueue::apply_closure_to_buffer(_should_do_processing ? &cl : NULL, buf, 0, _sz, true, worker_i); + cl.flush_buffer(); if (b) Atomic::inc(&_processed_buffers_mut); // If we had not claimed an id before entering the method --- old/src/share/vm/gc/g1/dirtyCardQueue.hpp 2015-08-12 14:09:24.000000000 +0200 +++ new/src/share/vm/gc/g1/dirtyCardQueue.hpp 2015-08-12 14:09:24.000000000 +0200 @@ -80,9 +80,6 @@ class DirtyCardQueueSet: public PtrQueueSet { - // The closure used in mut_process_buffer(). - CardTableEntryClosure* _mut_process_closure; - DirtyCardQueue _shared_dirty_card_queue; // Override. @@ -98,10 +95,12 @@ // Current buffer node used for parallel iteration. 
BufferNode* volatile _cur_par_buffer_node; + + bool _should_do_processing; public: DirtyCardQueueSet(bool notify_when_complete = true); - void initialize(CardTableEntryClosure* cl, Monitor* cbl_mon, Mutex* fl_lock, + void initialize(bool should_do_processing, Monitor* cbl_mon, Mutex* fl_lock, int process_completed_threshold, int max_completed_queue, Mutex* lock, PtrQueueSet* fl_owner = NULL); --- old/src/share/vm/gc/g1/g1CollectedHeap.cpp 2015-08-12 14:09:25.000000000 +0200 +++ new/src/share/vm/gc/g1/g1CollectedHeap.cpp 2015-08-12 14:09:25.000000000 +0200 @@ -65,6 +65,7 @@ #include "memory/iterator.hpp" #include "oops/oop.inline.hpp" #include "runtime/atomic.inline.hpp" +#include "runtime/globalSynchronizer.hpp" #include "runtime/orderAccess.inline.hpp" #include "runtime/vmThread.hpp" #include "utilities/globalDefinitions.hpp" @@ -89,29 +90,289 @@ // is done by clients of this interface.) // Local to this file. +bool RefineCardTableEntryClosure::do_card_ptr(jbyte* card_ptr, uint worker_i) { + bool oops_into_cset = G1CollectedHeap::heap()->g1_rem_set()->refine_card(card_ptr, worker_i, false); + // This path is executed by the concurrent refine or mutator threads, + // concurrently, and so we do not care if card_ptr contains references + // that point into the collection set. + assert(!oops_into_cset, "should be"); -class RefineCardTableEntryClosure: public CardTableEntryClosure { - bool _concurrent; -public: - RefineCardTableEntryClosure() : _concurrent(true) { } + // return false if caller should yield + return !(G1CollectedHeap::heap()->refine_cte_cl_concurrency() && SuspendibleThreadSet::should_yield()); +} - bool do_card_ptr(jbyte* card_ptr, uint worker_i) { - bool oops_into_cset = G1CollectedHeap::heap()->g1_rem_set()->refine_card(card_ptr, worker_i, false); - // This path is executed by the concurrent refine or mutator threads, - // concurrently, and so we do not care if card_ptr contains references - // that point into the collection set. - assert(!oops_into_cset, "should be"); +CardBuffer::CardBuffer() + : _next(NULL) { + int size = BufferedRefineCardTableEntryClosure::buffer_size(); + _card_buffer = NEW_C_HEAP_ARRAY(jbyte*, size, mtGC); + _mr_buffer = NEW_C_HEAP_ARRAY(MemRegion, size, mtGC); + _gs = new SynchronizerObj(); + _misses = 0; +} - if (_concurrent && SuspendibleThreadSet::should_yield()) { - // Caller will actually yield. 
- return false; +CardBuffer::~CardBuffer() { + FREE_C_HEAP_ARRAY(jbyte*, _card_buffer); + FREE_C_HEAP_ARRAY(MemRegion, _mr_buffer); + delete _gs; +} + +BufferedRefineCardTableEntryClosure::BufferedRefineCardTableEntryClosure() + : _index(0), _g1h(G1CollectedHeap::heap()), _head_buffer(NULL), _tail_buffer(NULL), + _current_buffer(NULL), _async_buffers(0) { +} + +BufferedRefineCardTableEntryClosure::~BufferedRefineCardTableEntryClosure() { + assert(_index == 0, "must flush refine card buffer"); + assert(_head_buffer == NULL && _tail_buffer == NULL, "must flush all async cards first"); + assert(_async_buffers == 0, "must flush all async cards first"); + if (_current_buffer) delete _current_buffer; +} + +bool BufferedRefineCardTableEntryClosure::do_card_ptr(jbyte *card_ptr, uint worker_i) { + _worker_i = worker_i; + if (_index == buffer_size()) soft_flush(); + if (_current_buffer == NULL) _current_buffer = new CardBuffer(); + _current_buffer->_card_buffer[_index++] = card_ptr; + + bool should_yield = _g1h->refine_cte_cl_concurrency() && SuspendibleThreadSet::should_yield(); + if (should_yield) flush_buffer(); + + // return false if caller should yield + return !should_yield; +} + +void BufferedRefineCardTableEntryClosure::soft_flush() { + general_flush(false); +} + +// Procedures used to sort and join G1 cards during refinement +static void quick_sort(jbyte **card_array, MemRegion *region_array, int left, int right); +static int partition(jbyte **card_array, MemRegion *region_array, int left, int right); +static int join_cards(jbyte **card_array, MemRegion *region_array, int length); + +static void quick_sort(jbyte **card_array, MemRegion *region_array, int left, int right) { + int middle; + if (left < right) + { + middle = partition(card_array, region_array, left, right); + quick_sort(card_array, region_array, left, middle); + quick_sort(card_array, region_array, middle + 1, right); + } +} + +static int partition(jbyte **card_array, MemRegion *region_array, int left, int right) { + jbyte *card = card_array[left]; + int i = left; + int j; + + for (j = left + 1; j < right; j++) + { + if (card_array[j] <= card) + { + i = i + 1; + swap(card_array[i], card_array[j]); + swap(region_array[i], region_array[j]); + } + } + + swap(card_array[i], card_array[left]); + swap(region_array[i], region_array[left]); + return i; +} + +static int join_cards(jbyte **card_array, MemRegion *region_array, int length) { + G1CollectedHeap *g1h = G1CollectedHeap::heap(); + jbyte *prev_card = NULL; + HeapRegion *prev_hr = NULL; + int insert_head = 0; + for (int i = 0; i < length; i++) { + jbyte *card = card_array[i]; + + if (*card == CardTableModRefBS::clean_card_val()) { + HeapRegion *hr = g1h->heap_region_containing_raw(region_array[i].start()); + if (card == prev_card + 1 && hr == prev_hr) { + MemRegion insert_region = region_array[insert_head - 1]; + region_array[insert_head - 1] = MemRegion(insert_region.start(), region_array[i].end()); + } else { + card_array[insert_head] = card; + region_array[insert_head] = region_array[i]; + insert_head++; + } + prev_hr = hr; + } + + prev_card = card; + } + + return insert_head; +} + +int BufferedRefineCardTableEntryClosure::buffer_size() { + return (int)G1UpdateBufferSize; +} + +void BufferedRefineCardTableEntryClosure::flush_buffer() { + general_flush(true); +} + +// Returns true if it needs post sync +bool BufferedRefineCardTableEntryClosure::pre_sync(CardBuffer *buffer, bool hard) { + // 1. Clean all cards in the batch. 
+ G1RemSet *g1rs = G1CollectedHeap::heap()->g1_rem_set(); + int needs_processing = 0; + + jbyte **const card_buffer = buffer->_card_buffer; + MemRegion *const mr_buffer = buffer->_mr_buffer; + const int length = buffer->_length; + + for (int i = 0; i < length; i++) { + if (g1rs->clean_card(card_buffer[i], _worker_i, mr_buffer[i])) { + card_buffer[needs_processing] = card_buffer[i]; + mr_buffer[needs_processing] = mr_buffer[i]; + needs_processing++; + } + } + buffer->_length = needs_processing; + + if (needs_processing == 0) { + if (hard) { + // If we are forced to finish scanning, we must serialize stores anyway. + OrderAccess::storeload(); + if (G1ElideMembar) { + buffer->_gs->start_synchronizing(); + } + } + return false; + } + + OrderAccess::storeload(); + if (G1ElideMembar) { + buffer->_gs->start_synchronizing(); + } + + // 2. Sort the cards + quick_sort(buffer->_card_buffer, buffer->_mr_buffer, 0, buffer->_length); + + return true; +} + +bool BufferedRefineCardTableEntryClosure::sync(CardBuffer *buffer, bool hard) { + if (!G1ElideMembar) return true; + + bool success = buffer->_gs->try_synchronize(); + if (hard) { + if (!success) { + buffer->_gs->maximize_urgency(); + buffer->_gs->synchronize(); } - // Otherwise, we finished successfully; return true. return true; + } else { + return success; } +} - void set_concurrent(bool b) { _concurrent = b; } -}; +void BufferedRefineCardTableEntryClosure::post_sync(CardBuffer *buffer) { + const int length = buffer->_length; + + const int card_batch_size = 16; + jbyte **current_card = buffer->_card_buffer; + MemRegion *current_region = buffer->_mr_buffer; + + const uintx interval = PrefetchScanIntervalInBytes * 2; + + G1RemSet *g1rs = G1CollectedHeap::heap()->g1_rem_set(); + + // 3. Batch 16 cards at a time + + for (int j = 0; j < length; j += card_batch_size) { + // 4. Join consecutive cards together and prefetch next card + int batch = MIN2((length - j), card_batch_size); + batch = join_cards(current_card, current_region, batch); + + jbyte dirty_card_val = CardTableModRefBS::dirty_card_val(); + jbyte *end_card; + HeapWord *end_prefetch; + + if (j + card_batch_size < length) { + end_prefetch = current_region[card_batch_size].start(); + end_card = current_card[card_batch_size]; + } else { + end_card = &dirty_card_val; + } + + MemRegion *region_end = current_region + batch; + jbyte** batch_card; + MemRegion* batch_region; + + for (batch_card = current_card, batch_region = current_region; batch_region != region_end; batch_card++) { + jbyte *card = *batch_card; + MemRegion mr = *batch_region; + MemRegion *next_region = batch_region + 1; + + if (next_region != region_end) { + MemRegion next_region_val = *next_region; + // Prefetch interval in batch + Prefetch::read(next_region_val.start(), next_region_val.byte_size()); + } else if (*end_card == CardTableModRefBS::clean_card_val()) { + // Prefetch broken interval to next batch + Prefetch::read(end_prefetch, interval); + } + + g1rs->refine_card_buffered(card, _worker_i, /*check_for_cset_refs*/ false, mr); + + batch_region = next_region; + } + + current_region += card_batch_size; + current_card += card_batch_size; + } +} + +void BufferedRefineCardTableEntryClosure::general_flush(bool hard) { + if (_index == 0) { + assert(hard, "invariant"); + if (_async_buffers == 0) return; + } + + // 1. 
Start asynchronous synchronization for the current buffer + if (_current_buffer == NULL) _current_buffer = new CardBuffer(); + _current_buffer->_length = _index; + if (pre_sync(_current_buffer, hard) || hard) { + // append async buffer + CardBuffer *tail = _tail_buffer; + if (tail != NULL) tail->_next = _current_buffer; + _tail_buffer = _current_buffer; + if (_head_buffer == NULL) _head_buffer = _current_buffer; + if (hard) sync(_current_buffer, hard); + _current_buffer = NULL; + _async_buffers++; + } + + _index = 0; + + // 2. Process old batches that have been cleaned but couldn't synchronize (async completion) + CardBuffer *current = _head_buffer; + bool check_sync = true; + while (current != NULL) { + if (hard || sync(current, hard)) { + post_sync(current); + CardBuffer *next = current->_next; + _head_buffer = next; + if (next == NULL) _tail_buffer = NULL; + delete current; + current = next; + _async_buffers--; + } else { + current->_misses++; + if (_async_buffers > 4 && current->_misses > 2 + || _async_buffers > 8 && current->_misses > 4 + || _async_buffers > 16 && current->_misses > 6) { + current->_gs->increase_urgency(); + } + break; + } + } +} class RedirtyLoggedCardTableEntryClosure : public CardTableEntryClosure { @@ -1919,7 +2180,7 @@ _bot_shared(NULL), _cg1r(NULL), _g1mm(NULL), - _refine_cte_cl(NULL), + _refine_cte_cl_concurrency(true), _secondary_free_list("Secondary Free List", new SecondaryFreeRegionListMtSafeChecker()), _old_set("Old Set", false /* humongous */, new OldRegionSetMtSafeChecker()), _humongous_set("Master Humongous Set", true /* humongous */, new HumongousRegionSetMtSafeChecker()), @@ -2032,9 +2293,7 @@ Universe::check_alignment(max_byte_size, HeapRegion::GrainBytes, "g1 heap"); Universe::check_alignment(max_byte_size, heap_alignment, "g1 heap"); - _refine_cte_cl = new RefineCardTableEntryClosure(); - - _cg1r = new ConcurrentG1Refine(this, _refine_cte_cl); + _cg1r = new ConcurrentG1Refine(this); // Reserve the maximum. @@ -2158,14 +2417,14 @@ G1SATBProcessCompletedThreshold, Shared_SATB_Q_lock); - JavaThread::dirty_card_queue_set().initialize(_refine_cte_cl, + JavaThread::dirty_card_queue_set().initialize(true, DirtyCardQ_CBL_mon, DirtyCardQ_FL_lock, concurrent_g1_refine()->yellow_zone(), concurrent_g1_refine()->red_zone(), Shared_DirtyCardQ_lock); - dirty_card_queue_set().initialize(NULL, // Should never be called by the Java code + dirty_card_queue_set().initialize(false, // Should never be called by the Java code DirtyCardQ_CBL_mon, DirtyCardQ_FL_lock, -1, // never trigger processing @@ -2175,7 +2434,7 @@ // Initialize the card queue set used to hold cards containing // references into the collection set. 
- _into_cset_dirty_card_queue_set.initialize(NULL, // Should never be called by the Java code + _into_cset_dirty_card_queue_set.initialize(false, // Should never be called by the Java code DirtyCardQ_CBL_mon, DirtyCardQ_FL_lock, -1, // never trigger processing @@ -6381,7 +6640,11 @@ } void G1CollectedHeap::set_refine_cte_cl_concurrency(bool concurrent) { - _refine_cte_cl->set_concurrent(concurrent); + _refine_cte_cl_concurrency = concurrent; +} + +bool G1CollectedHeap::refine_cte_cl_concurrency() { + return _refine_cte_cl_concurrency; } bool G1CollectedHeap::is_in_closed_subset(const void* p) const { --- old/src/share/vm/gc/g1/g1CollectedHeap.hpp 2015-08-12 14:09:27.000000000 +0200 +++ new/src/share/vm/gc/g1/g1CollectedHeap.hpp 2015-08-12 14:09:26.000000000 +0200 @@ -57,6 +57,7 @@ class OopsInHeapRegionClosure; class G1KlassScanClosure; class G1ParScanThreadState; +class GlobalSynchronizer; class ObjectClosure; class SpaceClosure; class CompactibleSpaceClosure; @@ -169,7 +170,51 @@ bool do_object_b(oop p); }; -class RefineCardTableEntryClosure; +class RefineCardTableEntryClosure: public CardTableEntryClosure { +public: + RefineCardTableEntryClosure() { } + bool do_card_ptr(jbyte* card_ptr, uint worker_i); +}; + +class CardBuffer : public CHeapObj<mtGC> { +public: + CardBuffer *_next; + GlobalSynchronizer *_gs; + jbyte **_card_buffer; + MemRegion *_mr_buffer; + int _length; + + int _misses; + + CardBuffer(); + virtual ~CardBuffer(); +}; + +class BufferedRefineCardTableEntryClosure: public CardTableEntryClosure { + CardBuffer *_head_buffer; + CardBuffer *_tail_buffer; + CardBuffer *_current_buffer; + + int _index; + int _async_buffers; + + uint _worker_i; + G1CollectedHeap *const _g1h; + + bool pre_sync(CardBuffer *buffer, bool hard); + bool sync(CardBuffer *buffer, bool hard); + void post_sync(CardBuffer *buffer); + + void general_flush(bool hard); + void soft_flush(); +public: + BufferedRefineCardTableEntryClosure(); + ~BufferedRefineCardTableEntryClosure(); + static int buffer_size(); + bool do_card_ptr(jbyte *card_ptr, uint worker_i); + void flush_buffer(); +}; + class G1RegionMappingChangedListener : public G1MappingChangedListener { private: @@ -831,8 +876,7 @@ // concurrently after the collection. DirtyCardQueueSet _dirty_card_queue_set; - // The closure used to refine a single card. - RefineCardTableEntryClosure* _refine_cte_cl; + bool _refine_cte_cl_concurrency; // A DirtyCardQueueSet that is used to hold cards that contain // references into the current collection set. This is used to @@ -1020,6 +1064,7 @@ public: + bool refine_cte_cl_concurrency(); void set_refine_cte_cl_concurrency(bool concurrent); RefToScanQueue *task_queue(uint i) const; --- old/src/share/vm/gc/g1/g1RemSet.cpp 2015-08-12 14:09:28.000000000 +0200 +++ new/src/share/vm/gc/g1/g1RemSet.cpp 2015-08-12 14:09:28.000000000 +0200 @@ -227,6 +227,186 @@ size_t cards_looked_up() { return _cards;} }; +bool G1RemSet::clean_card(jbyte* &card_ptr, + uint worker_i, + MemRegion &dirty_region) { + assert(_g1->is_in_exact(_ct_bs->addr_for(card_ptr)), + err_msg("Card at " PTR_FORMAT " index " SIZE_FORMAT " representing heap at " PTR_FORMAT " (%u) must be in committed heap", + p2i(card_ptr), + _ct_bs->index_for(_ct_bs->addr_for(card_ptr)), + p2i(_ct_bs->addr_for(card_ptr)), + _g1->addr_to_region(_ct_bs->addr_for(card_ptr)))); + + // If the card is no longer dirty, nothing to do. + if (*card_ptr != CardTableModRefBS::dirty_card_val()) { + // No need to return that this card contains refs that point + // into the collection set.
+ return false; + } + + // Construct the region representing the card. + HeapWord* start = _ct_bs->addr_for(card_ptr); + // And find the region containing it. + HeapRegion* r = _g1->heap_region_containing(start); + + // Why do we have to check here whether a card is on a young region, + // given that we dirty young regions and, as a result, the + // post-barrier is supposed to filter them out and never to enqueue + // them? When we allocate a new region as the "allocation region" we + // actually dirty its cards after we release the lock, since card + // dirtying while holding the lock was a performance bottleneck. So, + // as a result, it is possible for other threads to actually + // allocate objects in the region (after the acquire the lock) + // before all the cards on the region are dirtied. This is unlikely, + // and it doesn't happen often, but it can happen. So, the extra + // check below filters out those cards. + if (r->is_young()) { + return false; + } + + // While we are processing RSet buffers during the collection, we + // actually don't want to scan any cards on the collection set, + // since we don't want to update remembered sets with entries that + // point into the collection set, given that live objects from the + // collection set are about to move and such entries will be stale + // very soon. This change also deals with a reliability issue which + // involves scanning a card in the collection set and coming across + // an array that was being chunked and looking malformed. Note, + // however, that if evacuation fails, we have to scan any objects + // that were not moved and create any missing entries. + if (r->in_collection_set()) { + return false; + } + + // The result from the hot card cache insert call is either: + // * pointer to the current card + // (implying that the current card is not 'hot'), + // * null + // (meaning we had inserted the card ptr into the "hot" card cache, + // which had some headroom), + // * a pointer to a "hot" card that was evicted from the "hot" cache. + // + + G1HotCardCache* hot_card_cache = _cg1r->hot_card_cache(); + if (hot_card_cache->use_cache()) { + assert(!SafepointSynchronize::is_at_safepoint(), "sanity"); + + card_ptr = hot_card_cache->insert(card_ptr); + if (card_ptr == NULL) { + // There was no eviction. Nothing to do. + return false; + } + + start = _ct_bs->addr_for(card_ptr); + r = _g1->heap_region_containing(start); + + // Checking whether the region we got back from the cache + // is young here is inappropriate. The region could have been + // freed, reallocated and tagged as young while in the cache. + // Hence we could see its young type change at any time. + } + + // Don't use addr_for(card_ptr + 1) which can ask for + // a card beyond the heap. This is not safe without a perm + // gen at the upper end of the heap. + HeapWord* end = start + CardTableModRefBS::card_size_in_words; + dirty_region = MemRegion(start, end); + +#if CARD_REPEAT_HISTO + init_ct_freq_table(_g1->max_capacity()); + ct_freq_note_card(_ct_bs->index_for(start)); +#endif + + return r->clean_card(dirty_region, /*filter young*/ true, card_ptr); +} + +bool G1RemSet::refine_card_buffered(jbyte* card_ptr, + uint worker_i, + bool check_for_refs_into_cset, + MemRegion dirty_region) { + // And find the region containing it. 
+ HeapRegion* r = _g1->heap_region_containing(dirty_region.start()); + + G1ParPushHeapRSClosure* oops_in_heap_closure = NULL; + if (check_for_refs_into_cset) { + // ConcurrentG1RefineThreads have worker numbers larger than what + // _cset_rs_update_cl[] is set up to handle. But those threads should + // only be active outside of a collection which means that when they + // reach here they should have check_for_refs_into_cset == false. + assert((size_t)worker_i < n_workers(), "index of worker larger than _cset_rs_update_cl[].length"); + oops_in_heap_closure = _cset_rs_update_cl[worker_i]; + } + + G1UpdateRSOrPushRefOopClosure update_rs_oop_cl(_g1, + _g1->g1_rem_set(), + oops_in_heap_closure, + check_for_refs_into_cset, + worker_i); + update_rs_oop_cl.set_from(r); + + G1TriggerClosure trigger_cl; + FilterIntoCSClosure into_cs_cl(NULL, _g1, &trigger_cl); + G1InvokeIfNotTriggeredClosure invoke_cl(&trigger_cl, &into_cs_cl); + G1Mux2Closure mux(&invoke_cl, &update_rs_oop_cl); + + FilterOutOfRegionClosure filter_then_update_rs_oop_cl(r, + (check_for_refs_into_cset ? + (OopClosure*)&mux : + (OopClosure*)&update_rs_oop_cl)); + + // The region for the current card may be a young region. The + // current card may have been a card that was evicted from the + // card cache. When the card was inserted into the cache, we had + // determined that its region was non-young. While in the cache, + // the region may have been freed during a cleanup pause, reallocated + // and tagged as young. + // + // We wish to filter out cards for such a region but the current + // thread, if we're running concurrently, may "see" the young type + // change at any time (so an earlier "is_young" check may pass or + // fail arbitrarily). We tell the iteration code to perform this + // filtering when it has been determined that there has been an actual + // allocation in this region and making it safe to check the young type. + bool filter_young = true; + + HeapWord* stop_point = + r->process_oops_on_card(dirty_region, &filter_then_update_rs_oop_cl, card_ptr); + + // If stop_point is non-null, then we encountered an unallocated region + // (perhaps the unfilled portion of a TLAB.) For now, we'll dirty the + // card and re-enqueue: if we put off the card until a GC pause, then the + // unallocated portion will be filled in. Alternatively, we might try + // the full complexity of the technique used in "regular" precleaning. + if (stop_point != NULL) { + // The card might have gotten re-dirtied and re-enqueued while we + // worked. (In fact, it's pretty likely.) + card_ptr = G1CollectedHeap::heap()->g1_barrier_set()->byte_for(stop_point); + + if (*card_ptr != CardTableModRefBS::dirty_card_val()) { + *card_ptr = CardTableModRefBS::dirty_card_val(); + MutexLockerEx x(Shared_DirtyCardQ_lock, + Mutex::_no_safepoint_check_flag); + DirtyCardQueue* sdcq = + JavaThread::dirty_card_queue_set().shared_dirty_card_queue(); + sdcq->enqueue(card_ptr); + } + } else { + _conc_refine_cards++; + } + + // This gets set to true if the card being refined has + // references that point into the collection set. + bool has_refs_into_cset = trigger_cl.triggered(); + + // We should only be detecting that the card contains references + // that point into the collection set if the current thread is + // a GC worker thread. 
+ assert(!has_refs_into_cset || SafepointSynchronize::is_at_safepoint(), + "invalid result at non safepoint"); + + return has_refs_into_cset; +} + void G1RemSet::scanRS(G1ParPushHeapRSClosure* oc, CodeBlobClosure* code_root_cl, uint worker_i) { --- old/src/share/vm/gc/g1/g1RemSet.hpp 2015-08-12 14:09:29.000000000 +0200 +++ new/src/share/vm/gc/g1/g1RemSet.hpp 2015-08-12 14:09:29.000000000 +0200 @@ -106,6 +106,15 @@ void prepare_for_oops_into_collection_set_do(); void cleanup_after_oops_into_collection_set_do(); + bool clean_card(jbyte* &card_ptr, + uint worker_i, + MemRegion &dirty_region); + bool refine_card_buffered(jbyte* card_ptr, + uint worker_i, + bool check_for_refs_into_cset, + MemRegion dirty_region); + + void scanRS(G1ParPushHeapRSClosure* oc, CodeBlobClosure* code_root_cl, uint worker_i); --- old/src/share/vm/gc/g1/heapRegion.cpp 2015-08-12 14:09:30.000000000 +0200 +++ new/src/share/vm/gc/g1/heapRegion.cpp 2015-08-12 14:09:30.000000000 +0200 @@ -361,12 +361,10 @@ return NULL; } -HeapWord* -HeapRegion:: -oops_on_card_seq_iterate_careful(MemRegion mr, - FilterOutOfRegionClosure* cl, - bool filter_young, - jbyte* card_ptr) { + +bool HeapRegion::clean_card(MemRegion& mr, + bool filter_young, + jbyte* &card_ptr) { // Currently, we should only have to clean the card if filter_young // is true and vice versa. if (filter_young) { @@ -384,7 +382,7 @@ } else { mr = mr.intersection(used_region()); } - if (mr.is_empty()) return NULL; + if (mr.is_empty()) return false; // Otherwise, find the obj that extends onto mr.start(). // The intersection of the incoming mr (for the card) and the @@ -394,7 +392,7 @@ // is_young tag on the region before allocating. Thus we // safely know if this region is young. if (is_young() && filter_young) { - return NULL; + return false; } assert(!is_young(), "check value of filter_young"); @@ -404,17 +402,25 @@ // asked to (i.e., card_ptr != NULL). if (card_ptr != NULL) { *card_ptr = CardTableModRefBS::clean_card_val(); - // We must complete this write before we do any of the reads below. - OrderAccess::storeload(); } + return true; +} + +HeapWord* HeapRegion::process_oops_on_card(MemRegion mr, + FilterOutOfRegionClosure *cl, + jbyte *card_ptr) { + G1CollectedHeap* g1h = G1CollectedHeap::heap(); + G1SATBCardTableLoggingModRefBS* bs = g1h->g1_barrier_set(); // Cache the boundaries of the memory region in some const locals HeapWord* const start = mr.start(); HeapWord* const end = mr.end(); + HeapWord* cur; + // We used to use "block_start_careful" here. But we're actually happy // to update the BOT while we do this... 
- HeapWord* cur = block_start(start); + cur = block_start(start); assert(cur <= start, "Postcondition"); oop obj; @@ -464,6 +470,17 @@ return NULL; } +HeapWord* +HeapRegion:: +oops_on_card_seq_iterate_careful(MemRegion mr, + FilterOutOfRegionClosure* cl, + bool filter_young, + jbyte* card_ptr) { + if (!clean_card(mr, filter_young, card_ptr)) return NULL; + if (card_ptr != NULL) OrderAccess::storeload(); // serialize card cleaning + return process_oops_on_card(mr, cl, card_ptr); +} + // Code roots support void HeapRegion::add_strong_code_root(nmethod* nm) { @@ -1029,4 +1046,3 @@ set_saved_mark_word(NULL); reset_bot(); } - --- old/src/share/vm/gc/g1/heapRegion.hpp 2015-08-12 14:09:31.000000000 +0200 +++ new/src/share/vm/gc/g1/heapRegion.hpp 2015-08-12 14:09:31.000000000 +0200 @@ -715,6 +715,14 @@ HeapWord* object_iterate_mem_careful(MemRegion mr, ObjectClosure* cl); + bool clean_card(MemRegion& mr, + bool filter_young, + jbyte* &card_ptr); + + HeapWord* process_oops_on_card(MemRegion mr, + FilterOutOfRegionClosure *cl, + jbyte *card_ptr); + // filter_young: if true and the region is a young region then we // skip the iteration. // card_ptr: if not NULL, and we decide that the card is not young --- old/src/share/vm/gc/shared/cardTableModRefBS.hpp 2015-08-12 14:09:33.000000000 +0200 +++ new/src/share/vm/gc/shared/cardTableModRefBS.hpp 2015-08-12 14:09:32.000000000 +0200 @@ -43,6 +43,7 @@ class CardTableModRefBS: public ModRefBarrierSet { // Some classes get to look at some private stuff. friend class VMStructs; + friend class G1RemSet; protected: enum CardValues { --- old/src/share/vm/opto/compile.cpp 2015-08-12 14:09:34.000000000 +0200 +++ new/src/share/vm/opto/compile.cpp 2015-08-12 14:09:34.000000000 +0200 @@ -3388,6 +3388,7 @@ return false; } + //-----------------------------too_many_traps---------------------------------- // Report if there are too many traps at the current method and bci. // Return true if there was a trap, and/or PerMethodTrapLimit is exceeded. --- old/src/share/vm/opto/compile.hpp 2015-08-12 14:09:35.000000000 +0200 +++ new/src/share/vm/opto/compile.hpp 2015-08-12 14:09:35.000000000 +0200 @@ -34,6 +34,7 @@ #include "libadt/dict.hpp" #include "libadt/vectset.hpp" #include "memory/resourceArea.hpp" +#include "opto/safepointTable.hpp" #include "opto/idealGraphPrinter.hpp" #include "opto/phasetype.hpp" #include "opto/phase.hpp" @@ -276,6 +277,12 @@ bool can_be_reused() const { return _can_be_reused; } }; +private: + ThreadLocalSafepointTable _tls_table; + +public: + ThreadLocalSafepointTable *tls_table() { return &_tls_table; } + // Constant table. class ConstantTable { private: --- old/src/share/vm/opto/graphKit.cpp 2015-08-12 14:09:36.000000000 +0200 +++ new/src/share/vm/opto/graphKit.cpp 2015-08-12 14:09:36.000000000 +0200 @@ -4185,6 +4185,7 @@ Node* no_base = __ top(); float likely = PROB_LIKELY(0.999); float unlikely = PROB_UNLIKELY(0.999); + Node* clean_card = __ ConI((jint)CardTableModRefBS::clean_card_val()); Node* young_card = __ ConI((jint)G1SATBCardTableModRefBS::g1_young_card_val()); Node* dirty_card = __ ConI((jint)CardTableModRefBS::dirty_card_val()); Node* zeroX = __ ConX(0); @@ -4242,17 +4243,23 @@ // load the original value of the card Node* card_val = __ load(__ ctrl(), card_adr, TypeInt::INT, T_BYTE, Compile::AliasIdxRaw); - __ if_then(card_val, BoolTest::ne, young_card); { - sync_kit(ideal); - // Use Op_MemBarVolatile to achieve the effect of a StoreLoad barrier. 
- insert_mem_bar(Op_MemBarVolatile, oop_store); - __ sync_kit(this); - - Node* card_val_reload = __ load(__ ctrl(), card_adr, TypeInt::INT, T_BYTE, Compile::AliasIdxRaw); - __ if_then(card_val_reload, BoolTest::ne, dirty_card); { + if (G1ElideMembar) { + __ if_then(card_val, BoolTest::eq, clean_card); { g1_mark_card(ideal, card_adr, oop_store, alias_idx, index, index_adr, buffer, tf); } __ end_if(); - } __ end_if(); + } else { + __ if_then(card_val, BoolTest::ne, young_card); { + sync_kit(ideal); + // Use Op_MemBarVolatile to achieve the effect of a StoreLoad barrier. + insert_mem_bar(Op_MemBarVolatile, oop_store); + __ sync_kit(this); + + Node* card_val_reload = __ load(__ ctrl(), card_adr, TypeInt::INT, T_BYTE, Compile::AliasIdxRaw); + __ if_then(card_val_reload, BoolTest::ne, dirty_card); { + g1_mark_card(ideal, card_adr, oop_store, alias_idx, index, index_adr, buffer, tf); + } __ end_if(); + } __ end_if(); + } } __ end_if(); } __ end_if(); } else { --- old/src/share/vm/opto/output.cpp 2015-08-12 14:09:37.000000000 +0200 +++ new/src/share/vm/opto/output.cpp 2015-08-12 14:09:37.000000000 +0200 @@ -42,6 +42,7 @@ #include "opto/subnode.hpp" #include "opto/type.hpp" #include "runtime/handles.inline.hpp" +#include "runtime/sharedRuntime.hpp" #include "utilities/xmlstream.hpp" #ifndef PRODUCT @@ -1143,7 +1144,7 @@ // class HandlerImpl is platform-specific and defined in the *.ad files. int exception_handler_req = HandlerImpl::size_exception_handler() + MAX_stubs_size; // add marginal slop for handler int deopt_handler_req = HandlerImpl::size_deopt_handler() + MAX_stubs_size; // add marginal slop for handler - stub_req += MAX_stubs_size; // ensure per-stub margin + code_req += tls_table()->stub_size(); // ensure per-stub margin code_req += MAX_inst_size; // ensure per-instruction margin if (StressCodeBuffers) @@ -1634,6 +1635,9 @@ // Fill in exception table entries. FillExceptionTables(inct_cnt, call_returns, inct_starts, blk_labels); + // Fill in stubs for calling the runtime from safepoint polls. + tls_table()->emit(*cb, SharedRuntime::is_wide_vector(max_vector_size())); + // Only java methods have exception handlers and deopt handlers // class HandlerImpl is platform-specific and defined in the *.ad files. 
if (_method) { --- old/src/share/vm/runtime/arguments.cpp 2015-08-12 14:09:39.000000000 +0200 +++ new/src/share/vm/runtime/arguments.cpp 2015-08-12 14:09:39.000000000 +0200 @@ -36,6 +36,7 @@ #include "memory/allocation.inline.hpp" #include "memory/universe.inline.hpp" #include "oops/oop.inline.hpp" +#include "opto/safepointTable.hpp" #include "prims/jvmtiExport.hpp" #include "runtime/arguments.hpp" #include "runtime/arguments_ext.hpp" @@ -1680,6 +1681,31 @@ FLAG_SET_DEFAULT(GCTimeRatio, 9); } +#ifdef THREAD_LOCAL_SAFEPOINT_SUPPORT + if (!FLAG_IS_DEFAULT(G1ElideMembar) && G1ElideMembar) { + if (!FLAG_IS_DEFAULT(UseMembar) && !UseMembar) { + UseMembar = true; + jio_fprintf(defaultStream::error_stream(), + "When G1ElideMembar is set, UseMembar must also be set\n"); + } else { + UseMembar = true; + } + if (!FLAG_IS_DEFAULT(ThreadLocalSafepoints) && !ThreadLocalSafepoints) { + ThreadLocalSafepoints = true; + jio_fprintf(defaultStream::error_stream(), + "When G1ElideMembar is set, ThreadLocalSafepoints must also be set\n"); + } else { + ThreadLocalSafepoints = true; + } + } +#else + if (G1ElideMembar) { + jio_fprintf(defaultStream::error_stream(), + "G1ElideMembar is not supported on this platform\n"); + G1ElideMembar = false; + } +#endif + if (PrintGCDetails && Verbose) { tty->print_cr("MarkStackSize: %uk MarkStackSizeMax: %uk", (unsigned int) (MarkStackSize / K), (uint) (MarkStackSizeMax / K)); @@ -3955,6 +3981,15 @@ } #endif +#ifndef THREAD_LOCAL_SAFEPOINT_SUPPORT + if (ThreadLocalSafepoints) { + ThreadLocalSafepoints = false; + jio_fprintf(defaultStream::error_stream(), + "ThreadLocalSafepoints is not supported on this platform\n"); + + } +#endif + return JNI_OK; } --- old/src/share/vm/runtime/globals.hpp 2015-08-12 14:09:40.000000000 +0200 +++ new/src/share/vm/runtime/globals.hpp 2015-08-12 14:09:40.000000000 +0200 @@ -647,6 +647,12 @@ develop(bool, CleanChunkPoolAsync, falseInEmbedded, \ "Clean the chunk pool asynchronously") \ \ + product(bool, ThreadLocalSafepoints, false, \ + "Use thread-local safepoints instead of global polling") \ + \ + product(bool, G1ElideMembar, false, \ + "Elide G1 write barrier membar using a handshake") \ + \ experimental(bool, AlwaysSafeConstructors, false, \ "Force safe construction, as if all fields are final.") \ \ --- old/src/share/vm/runtime/osThread.hpp 2015-08-12 14:09:42.000000000 +0200 +++ new/src/share/vm/runtime/osThread.hpp 2015-08-12 14:09:42.000000000 +0200 @@ -125,6 +125,9 @@ // thread has a unique thread_id (BsdThreads or NPTL). It can be used // to access /proc. thread_id_t _thread_id; + + public: + bool is_online(); }; --- old/src/share/vm/runtime/safepoint.cpp 2015-08-12 14:09:43.000000000 +0200 +++ new/src/share/vm/runtime/safepoint.cpp 2015-08-12 14:09:43.000000000 +0200 @@ -182,7 +182,12 @@ // Make interpreter safepoint aware Interpreter::notice_safepoints(); - if (DeferPollingPageLoopCount < 0) { + if (ThreadLocalSafepoints) { + for (JavaThread *cur = Threads::first(); cur != NULL; cur = cur->next()) { + // Make sure the threads start polling whether it's time to yield. + cur->set_yieldpoint(true); + } + } else if (DeferPollingPageLoopCount < 0) { // Make polling safepoint aware guarantee (PageArmed == 0, "invariant") ; PageArmed = 1 ; @@ -288,7 +293,7 @@ // 9. On windows consider using the return value from SwitchThreadTo() // to drive subsequent spin/SwitchThreadTo()/Sleep(N) decisions.
- if (int(iterations) == DeferPollingPageLoopCount) { + if (!ThreadLocalSafepoints && int(iterations) == DeferPollingPageLoopCount) { guarantee (PageArmed == 0, "invariant") ; PageArmed = 1 ; os::make_polling_page_unreadable(); --- old/src/share/vm/runtime/sharedRuntime.cpp 2015-08-12 14:09:44.000000000 +0200 +++ new/src/share/vm/runtime/sharedRuntime.cpp 2015-08-12 14:09:44.000000000 +0200 @@ -94,14 +94,22 @@ _resolve_virtual_call_blob = generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::resolve_virtual_call_C), "resolve_virtual_call"); _resolve_static_call_blob = generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::resolve_static_call_C), "resolve_static_call"); + address runtime_exception_handler; + + if (ThreadLocalSafepoints) { + runtime_exception_handler = CAST_FROM_FN_PTR(address, SharedRuntime::thread_local_safepoint); + } else { + runtime_exception_handler = CAST_FROM_FN_PTR(address, SafepointSynchronize::handle_polling_page_exception); + } + #ifdef COMPILER2 // Vectors are generated only by C2. if (is_wide_vector(MaxVectorSize)) { - _polling_page_vectors_safepoint_handler_blob = generate_handler_blob(CAST_FROM_FN_PTR(address, SafepointSynchronize::handle_polling_page_exception), POLL_AT_VECTOR_LOOP); + _polling_page_vectors_safepoint_handler_blob = generate_handler_blob(runtime_exception_handler, POLL_AT_VECTOR_LOOP); } #endif // COMPILER2 - _polling_page_safepoint_handler_blob = generate_handler_blob(CAST_FROM_FN_PTR(address, SafepointSynchronize::handle_polling_page_exception), POLL_AT_LOOP); - _polling_page_return_handler_blob = generate_handler_blob(CAST_FROM_FN_PTR(address, SafepointSynchronize::handle_polling_page_exception), POLL_AT_RETURN); + _polling_page_safepoint_handler_blob = generate_handler_blob(runtime_exception_handler, POLL_AT_LOOP); + _polling_page_return_handler_blob = generate_handler_blob(runtime_exception_handler, POLL_AT_RETURN); generate_deopt_blob(); @@ -110,6 +118,14 @@ #endif // COMPILER2 } +void SharedRuntime::thread_local_safepoint(JavaThread *thread) { + thread->set_yieldpoint(false); + thread->update_serialized_memory_version(); + if (SafepointSynchronize::is_synchronizing()) { + SafepointSynchronize::handle_polling_page_exception(thread); + } +} + #include // Implementation of SharedRuntime @@ -196,6 +212,7 @@ // G1 write-barrier pre: executed before a pointer store. JRT_LEAF(void, SharedRuntime::g1_wb_pre(oopDesc* orig, JavaThread *thread)) + thread->update_serialized_memory_version(); if (orig == NULL) { assert(false, "should be optimized out"); return; @@ -207,6 +224,7 @@ // G1 write-barrier post: executed after a pointer store. 
JRT_LEAF(void, SharedRuntime::g1_wb_post(void* card_addr, JavaThread* thread)) + thread->update_serialized_memory_version(); thread->dirty_card_queue().enqueue(card_addr); JRT_END @@ -520,13 +538,6 @@ // Look up the code blob CodeBlob *cb = CodeCache::find_blob(pc); - // Should be an nmethod - assert(cb && cb->is_nmethod(), "safepoint polling: pc must refer to an nmethod"); - - // Look up the relocation information - assert(((nmethod*)cb)->is_at_poll_or_poll_return(pc), - "safepoint polling: type must be poll"); - assert(((NativeInstruction*)pc)->is_safepoint_poll(), "Only polling locations are used for safepoint"); --- old/src/share/vm/runtime/sharedRuntime.hpp 2015-08-12 14:09:45.000000000 +0200 +++ new/src/share/vm/runtime/sharedRuntime.hpp 2015-08-12 14:09:45.000000000 +0200 @@ -175,6 +175,7 @@ // exception handling across interpreter/compiler boundaries static address raw_exception_handler_for_return_address(JavaThread* thread, address return_address); static address exception_handler_for_return_address(JavaThread* thread, address return_address); + static void thread_local_safepoint(JavaThread *thread); #if INCLUDE_ALL_GCS // G1 write barriers --- old/src/share/vm/runtime/thread.cpp 2015-08-12 14:09:46.000000000 +0200 +++ new/src/share/vm/runtime/thread.cpp 2015-08-12 14:09:46.000000000 +0200 @@ -58,6 +58,7 @@ #include "runtime/fprofiler.hpp" #include "runtime/frame.inline.hpp" #include "runtime/globals.hpp" +#include "runtime/globalSynchronizer.hpp" #include "runtime/init.hpp" #include "runtime/interfaceSupport.hpp" #include "runtime/java.hpp" @@ -92,6 +93,7 @@ #include "utilities/defaultStream.hpp" #include "utilities/dtrace.hpp" #include "utilities/events.hpp" +#include "utilities/hashtable.hpp" #include "utilities/macros.hpp" #include "utilities/preserveException.hpp" #if INCLUDE_ALL_GCS @@ -209,6 +211,8 @@ // This initial value ==> never claimed. _oops_do_parity = 0; + _java_threads_do_hp = NULL; + // the handle mark links itself to last_handle_mark new HandleMark(this); @@ -1394,6 +1398,10 @@ // Set the claimed par_id to UINT_MAX (ie not claiming any par_ids) set_claimed_par_id(UINT_MAX); + set_yieldpoint(false); + _serialized_memory_version = GlobalSynchronizer::global_serialized_memory_version(); + _force_yield = false; + set_saved_exception_pc(NULL); set_threadObj(NULL); _anchor.clear(); @@ -1489,6 +1497,15 @@ assert(deferred_card_mark().is_empty(), "Default MemRegion ctor"); } +void JavaThread::update_serialized_memory_version() { + int global_version = GlobalSynchronizer::global_serialized_memory_version(); + int local_version = OrderAccess::load_acquire(&_serialized_memory_version); + if (local_version != global_version) { + assert(local_version < global_version, "sanity"); + OrderAccess::release_store(&_serialized_memory_version, global_version); + } +} + bool JavaThread::reguard_stack(address cur_sp) { if (_stack_guard_state != stack_guard_yellow_disabled) { return true; // Stack already guarded or guard pages not needed. @@ -1526,6 +1543,13 @@ } } +bool JavaThread::is_online_vm() { + return thread_state() == _thread_in_Java; +} + +bool JavaThread::is_online_os() { + return _osthread->is_online(); +} // Remove this ifdef when C1 is ported to the compiler interface. 
static void compiler_thread_entry(JavaThread* thread, TRAPS); @@ -1664,7 +1688,7 @@ DTRACE_THREAD_PROBE(stop, this); this->exit(false); - delete this; + Threads::smr_free(this, false); } @@ -1936,7 +1960,7 @@ #endif // INCLUDE_ALL_GCS Threads::remove(this); - delete this; + Threads::smr_free(this, false); } @@ -3199,11 +3223,14 @@ // operations from having the thread being operated on from exiting // and going away unexpectedly (e.g., safepoint synchronization) -JavaThread* Threads::_thread_list = NULL; -int Threads::_number_of_threads = 0; -int Threads::_number_of_non_daemon_threads = 0; -int Threads::_return_code = 0; -int Threads::_thread_claim_parity = 0; +JavaThread* Threads::_thread_list = NULL; +JavaThread* Threads::_thread_smr_list = NULL; +JavaThread** Threads::_thread_smr_list_list = NULL; +int Threads::_number_of_threads = 0; +int Threads::_number_of_non_daemon_threads = 0; +int Threads::_return_code = 0; +int Threads::_thread_claim_parity = 0; +JavaThread **volatile Threads::_fast_java_thread_list = NULL; size_t JavaThread::_stack_size_at_create = 0; #ifdef ASSERT bool Threads::_vm_complete = false; @@ -3238,6 +3265,22 @@ // If CompilerThreads ever become non-JavaThreads, add them here } +void Threads::java_threads_do_fast(ThreadClosure *tc, Thread *self) { + JavaThread **threads; + + // Stable load of thread list w.r.t. hazard pointer for SMR + do { + threads = (JavaThread**)OrderAccess::load_ptr_acquire((volatile void*)&_fast_java_thread_list); + OrderAccess::release_store_ptr_fence((volatile void*)&self->_java_threads_do_hp, (void*)threads); + } while ((JavaThread**)OrderAccess::load_ptr_acquire((volatile void*)&_fast_java_thread_list) != threads); + if (threads == NULL) return; + + for (JavaThread **current = threads; *current != NULL; current++) { + tc->do_thread(*current); + } + OrderAccess::release_store_ptr_fence((volatile void*)&self->_java_threads_do_hp, NULL); +} + void Threads::initialize_java_lang_classes(JavaThread* main_thread, TRAPS) { TraceTime timer("Initialize java.lang classes", TraceStartupTime); @@ -3397,7 +3440,7 @@ if (!main_thread->set_as_starting_thread()) { vm_shutdown_during_initialization( "Failed necessary internal allocation. 
Out of swap space"); - delete main_thread; + smr_free(main_thread, false); *canTryAgain = false; // don't let caller call JNI_CreateJavaVM again return JNI_ENOMEM; } @@ -3412,7 +3455,7 @@ // Initialize global modules jint status = init_globals(); if (status != JNI_OK) { - delete main_thread; + smr_free(main_thread, false); *canTryAgain = false; // don't let caller call JNI_CreateJavaVM again return status; } @@ -3962,7 +4005,7 @@ notify_vm_shutdown(); - delete thread; + smr_free(thread, true); // exit_globals() will delete tty exit_globals(); @@ -3985,6 +4028,192 @@ return JNI_FALSE; } +class ThreadScanEntry: public BasicHashtableEntry { +public: + void *_pointer; + + ThreadScanEntry* next() { + return (ThreadScanEntry*)BasicHashtableEntry::next(); + } + + const void* pointer() { return _pointer; } + void set_pointer(void* pointer) { _pointer = pointer; } +}; + +class ThreadScanHashtable : public BasicHashtable { +private: + inline unsigned int compute_hash(void* pointer) { + return (unsigned int)(((uint32_t)(uintptr_t)pointer) * 2654435761u); + } + + ThreadScanEntry* bucket(int index) { + return (ThreadScanEntry*)BasicHashtable::bucket(index); + } + + ThreadScanEntry* get_entry(int index, unsigned int hash, void *pointer) { + for (ThreadScanEntry* pp = bucket(index); pp != NULL; pp = pp->next()) { + if (pp->hash() == hash && + pp->pointer() == pointer) { + return pp; + } + } + return NULL; + } + +public: + ThreadScanHashtable(int table_size) + : BasicHashtable(table_size, sizeof(ThreadScanEntry)) {} + + ThreadScanEntry* get_entry(void *pointer) { + unsigned int hash = compute_hash(pointer); + return get_entry(hash_to_index(hash), hash, pointer); + } + + ThreadScanEntry* new_entry(void *pointer) { + unsigned int hash = compute_hash(pointer); + ThreadScanEntry* pp; + pp = (ThreadScanEntry*)BasicHashtable::new_entry(hash); + pp->set_pointer(pointer); + return pp; + } + + void add_entry(ThreadScanEntry* pp) { + int index = hash_to_index(pp->hash()); + BasicHashtable::add_entry(index, pp); + } +}; + +class ScanHazardPointerThreadClosure: public ThreadClosure { +private: + ThreadScanHashtable *_table; +public: + ScanHazardPointerThreadClosure(ThreadScanHashtable *table) : _table(table) {} + + virtual void do_thread(Thread *thread) { + assert_locked_or_safepoint(Threads_lock); + assert(thread->is_Java_thread(), "sanity"); + JavaThread *const jthread = reinterpret_cast<JavaThread*>(thread); + JavaThread **threads = (JavaThread**)OrderAccess::load_ptr_acquire((volatile void*)&jthread->_java_threads_do_hp); + if (threads == NULL) return; + for (JavaThread** current = threads; *current != NULL; current++) { + JavaThread *p = *current; + if (_table->get_entry((void*)p) == NULL) { + _table->add_entry(_table->new_entry((void*)p)); + } + } + } +}; + +void Threads::smr_free(JavaThread *thread, bool have_lock) { + assert(!have_lock || Threads_lock->is_locked(), "Threads_lock inconsistency"); + JavaThread *delete_head; + if (!have_lock) { + MutexLocker ml(Threads_lock); + delete_head = smr_free_work(thread); + } else { + delete_head = smr_free_work(thread); + } + + while (delete_head != NULL) { + JavaThread *next = delete_head->next(); + delete delete_head; + delete_head = next; + } +} + +JavaThread *Threads::smr_free_work(JavaThread *thread) { + assert(Threads_lock->is_locked(), "Threads_lock should be locked"); + + thread->set_next(_thread_smr_list); + _thread_smr_list = thread; + + JavaThread *current = _thread_smr_list; + JavaThread *prev = NULL; + JavaThread *next = NULL; + JavaThread *delete_head = NULL; +
ThreadScanHashtable *scan_table = new ThreadScanHashtable(32); + ScanHazardPointerThreadClosure scan_cl(scan_table); + ALL_JAVA_THREADS(q) { + scan_cl.do_thread(q); + } + + while (current != NULL) { + next = current->next(); + if (!scan_table->get_entry((void*)current)) { + if (prev != NULL) { + prev->set_next(next); + } + if (_thread_smr_list == current) _thread_smr_list = next; + + current->set_next(delete_head); + delete_head = current; + } else { + prev = current; + } + + current = next; + } + + delete scan_table; + + return delete_head; +} + +class ScanHazardPointerThreadsClosure: public ThreadClosure { +private: + ThreadScanHashtable *_table; +public: + ScanHazardPointerThreadsClosure(ThreadScanHashtable *table) : _table(table) {} + + virtual void do_thread(Thread *thread) { + assert_locked_or_safepoint(Threads_lock); + assert(thread->is_Java_thread(), "sanity"); + JavaThread *const jthread = reinterpret_cast<JavaThread*>(thread); + JavaThread **threads = (JavaThread**)OrderAccess::load_ptr_acquire((volatile void*)&jthread->_java_threads_do_hp); + if (threads == NULL) return; + if (_table->get_entry((void*)threads) == NULL) { + _table->add_entry(_table->new_entry((void*)threads)); + } + } +}; + +void Threads::smr_free_list(JavaThread **threads) { + assert(Threads_lock->is_locked(), "Threads_lock should be locked"); + + JavaThread ***threads_header_addr = (JavaThread***)(threads - 1); + + *threads_header_addr = _thread_smr_list_list; + _thread_smr_list_list = threads; + + JavaThread **current = _thread_smr_list_list; + JavaThread **prev = NULL; + JavaThread **next = NULL; + + ThreadScanHashtable *scan_table = new ThreadScanHashtable(32); + ScanHazardPointerThreadsClosure scan_cl(scan_table); + ALL_JAVA_THREADS(q) { + scan_cl.do_thread(q); + } + + while (current != NULL) { + JavaThread ***current_header_addr = (JavaThread***)(current - 1); + next = *current_header_addr; + if (!scan_table->get_entry((void*)current)) { + if (prev != NULL) { + JavaThread ***prev_header_addr = (JavaThread***)(prev - 1); + // prev->next = current->next + *prev_header_addr = *current_header_addr; + } + if (_thread_smr_list_list == current) _thread_smr_list_list = *current_header_addr; + FREE_C_HEAP_ARRAY(JavaThread*, (JavaThread**)current_header_addr); + } else { + prev = current; + } + + current = next; + } +} void Threads::add(JavaThread* p, bool force_daemon) { // The threads lock must be owned at this point @@ -4007,6 +4236,15 @@ ThreadService::add_thread(p, daemon); + JavaThread **new_thread_list = NEW_C_HEAP_ARRAY(JavaThread*, _number_of_threads + 2, mtThread) + 1; + int i = 0; + ALL_JAVA_THREADS(q) { + new_thread_list[i++] = q; + } + new_thread_list[i] = NULL; + JavaThread **old_list = (JavaThread**)Atomic::xchg_ptr((void*)new_thread_list, (volatile void*)&_fast_java_thread_list); + if (old_list != NULL) smr_free_list(old_list); + // Possible GC point.
Events::log(p, "Thread added: " INTPTR_FORMAT, p); } @@ -4021,7 +4259,11 @@ JavaThread* current = _thread_list; JavaThread* prev = NULL; + JavaThread **new_thread_list = NEW_C_HEAP_ARRAY(JavaThread*, _number_of_threads + 1, mtThread) + 1; + int i = 0; + while (current != p) { + new_thread_list[i++] = current; prev = current; current = current->next(); } @@ -4031,6 +4273,16 @@ } else { _thread_list = p->next(); } + + current = current->next(); + while (current != NULL) { + new_thread_list[i++] = current; + current = current->next(); + } + new_thread_list[i] = NULL; + JavaThread **old_list = (JavaThread**)Atomic::xchg_ptr((void*)new_thread_list, (volatile void*)&_fast_java_thread_list); + if (old_list != NULL) smr_free_list(old_list); + _number_of_threads--; oop threadObj = p->threadObj(); bool daemon = true; --- old/src/share/vm/runtime/thread.hpp 2015-08-12 14:09:48.000000000 +0200 +++ new/src/share/vm/runtime/thread.hpp 2015-08-12 14:09:48.000000000 +0200 @@ -101,6 +101,9 @@ class Thread: public ThreadShadow { friend class VMStructs; + friend class Threads; + friend class ScanHazardPointerThreadClosure; + friend class ScanHazardPointerThreadsClosure; private: // Exception handling // (Note: _pending_exception and friends are in ThreadShadow) @@ -237,6 +240,8 @@ // claimed as a task. jint _oops_do_parity; + JavaThread **volatile _java_threads_do_hp; + public: void set_last_handle_mark(HandleMark* mark) { _last_handle_mark = mark; } HandleMark* last_handle_mark() const { return _last_handle_mark; } @@ -588,6 +593,8 @@ static ByteSize exception_line_offset() { return byte_offset_of(Thread, _exception_line); } static ByteSize active_handles_offset() { return byte_offset_of(Thread, _active_handles); } + static ByteSize yieldpoint_offset() { return byte_offset_of(Thread, _yieldpoint_poll); } + static ByteSize stack_base_offset() { return byte_offset_of(Thread, _stack_base); } static ByteSize stack_size_offset() { return byte_offset_of(Thread, _stack_size); } @@ -936,6 +943,19 @@ } _jmp_ring[jump_ring_buffer_size]; #endif // PRODUCT +private: + volatile int _serialized_memory_version; + volatile bool _force_yield; + +public: + int serialized_memory_version() { return _serialized_memory_version; } + void update_serialized_memory_version(); + + void set_force_yield() { _force_yield = true; } + + bool is_online_vm(); + bool is_online_os(); + #if INCLUDE_ALL_GCS // Support for G1 barriers @@ -1866,17 +1886,23 @@ class Threads: AllStatic { friend class VMStructs; private: - static JavaThread* _thread_list; - static int _number_of_threads; - static int _number_of_non_daemon_threads; - static int _return_code; - static int _thread_claim_parity; + static JavaThread* _thread_list; + static JavaThread* _thread_smr_list; + static JavaThread** _thread_smr_list_list; + static int _number_of_threads; + static int _number_of_non_daemon_threads; + static int _return_code; + static int _thread_claim_parity; #ifdef ASSERT - static bool _vm_complete; + static bool _vm_complete; #endif + static JavaThread **volatile _fast_java_thread_list; + static void initialize_java_lang_classes(JavaThread* main_thread, TRAPS); static void initialize_jsr292_core_classes(TRAPS); + static JavaThread *smr_free_work(JavaThread *thread); + static void smr_free_list(JavaThread **threads); public: // Thread management // force_daemon is a concession to JNI, where we may need to add a @@ -1887,6 +1913,9 @@ static JavaThread* first() { return _thread_list; } static void threads_do(ThreadClosure* tc); + static void 
java_threads_do_fast(ThreadClosure *tc, Thread *self); + static void smr_free(JavaThread *thread, bool have_lock); + + // Initializes the vm and creates the vm thread static jint create_vm(JavaVMInitArgs* args, bool* canTryAgain); static void convert_vm_init_libraries_to_agents(); --- old/src/share/vm/utilities/exceptions.hpp 2015-08-12 14:09:49.000000000 +0200 +++ new/src/share/vm/utilities/exceptions.hpp 2015-08-12 14:09:49.000000000 +0200 @@ -59,8 +59,9 @@ class ThreadShadow: public CHeapObj<mtThread> { friend class VMStructs; - protected: + char _yieldpoint_poll; + char _yieldpoint_spill[wordSize - 1]; oop _pending_exception; // Thread has gc actions. const char* _exception_file; // file information for exception (debugging only) int _exception_line; // line information for exception (debugging only) @@ -90,7 +91,15 @@ void clear_pending_exception(); ThreadShadow() : _pending_exception(NULL), - _exception_file(NULL), _exception_line(0) {} + _exception_file(NULL), _exception_line(0), _yieldpoint_poll(3) {} + + void set_yieldpoint(bool should_take_yieldpoint) { + _yieldpoint_poll = should_take_yieldpoint ? 0 : 3; + } + + bool yieldpoint() { + return _yieldpoint_poll == 3; + } }; --- old/src/share/vm/utilities/hashtable.cpp 2015-08-12 14:09:51.000000000 +0200 +++ new/src/share/vm/utilities/hashtable.cpp 2015-08-12 14:09:50.000000000 +0200 @@ -381,4 +381,5 @@ template class BasicHashtable; template class BasicHashtable; template class BasicHashtable; +template class BasicHashtable; template class BasicHashtable; --- /dev/null 2015-08-12 14:09:52.000000000 +0200 +++ new/src/cpu/x86/vm/c2_safepointTable_x86_64.cpp 2015-08-12 14:09:51.000000000 +0200 @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "opto/compile.hpp" +#include "opto/node.hpp" +#include "opto/safepointTable.hpp" +#include "runtime/sharedRuntime.hpp" + +Label &ThreadLocalSafepointTable::add_safepoint(InternalAddress safepoint_addr, bool is_return) { + ThreadLocalSafepointEntry *entry = new (Compile::current()->comp_arena()) ThreadLocalSafepointEntry(safepoint_addr, is_return); + int index = _safepoints.append(entry); + return _safepoints.at(index)->_stub_label; +} + +int ThreadLocalSafepointTable::stub_size() { + return _safepoints.length() * 14 * 2; +} + +#define __ _masm.
+void ThreadLocalSafepointTable::emit(CodeBuffer& cbuf, bool has_wide_vectors) { + //cb->insts()->freeze(); + + MacroAssembler _masm(&cbuf); + + for (int i = _safepoints.length() - 1; i >= 0; i--) { + ThreadLocalSafepointEntry &entry = *_safepoints.at(i); + + __ bind(entry._stub_label); + __ push(rax); + __ lea(rax, entry._safepoint_addr); + __ movptr(Address(r15_thread, JavaThread::saved_exception_pc_offset()), rax); + __ pop(rax); + + address stub; + + if (entry._is_return) { + assert(SharedRuntime::polling_page_return_handler_blob() != NULL, + "polling page return stub not created yet"); + stub = SharedRuntime::polling_page_return_handler_blob()->entry_point(); + } else if (has_wide_vectors) { + assert(SharedRuntime::polling_page_vectors_safepoint_handler_blob() != NULL, + "polling page safepoint stub not created yet"); + stub = SharedRuntime::polling_page_vectors_safepoint_handler_blob()->entry_point(); + } else { + assert(SharedRuntime::polling_page_safepoint_handler_blob() != NULL, + "polling page safepoint stub not created yet"); + stub = SharedRuntime::polling_page_safepoint_handler_blob()->entry_point(); + } + + RuntimeAddress callback_addr(stub); + + __ jump(callback_addr); + } + +} +#undef __ --- /dev/null 2015-08-12 14:09:53.000000000 +0200 +++ new/src/cpu/x86/vm/c2_safepointTable_x86_64.hpp 2015-08-12 14:09:52.000000000 +0200 @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_X86_VM_C2_SAFEPOINTTABLE_x86_64_HPP +#define CPU_X86_VM_C2_SAFEPOINTTABLE_x86_64_HPP + +#define THREAD_LOCAL_SAFEPOINT_SUPPORT 1 + +#include "asm/macroAssembler.hpp" +#include "utilities/growableArray.hpp" + +class ThreadLocalSafepointTable { +private: + struct ThreadLocalSafepointEntry : public ResourceObj { + InternalAddress _safepoint_addr; + Label _stub_label; + bool _is_return; + ThreadLocalSafepointEntry(InternalAddress safepoint_addr, bool is_return) : _safepoint_addr(safepoint_addr), _is_return(is_return) {} + }; + GrowableArray<ThreadLocalSafepointEntry*> _safepoints; + +public: + Label &add_safepoint(InternalAddress safepoint_addr, bool is_return); + + int stub_size(); + void emit(CodeBuffer &cb, bool has_wide_vectors); +}; + +#endif /* CPU_X86_VM_C2_SAFEPOINTTABLE_x86_64_HPP */ --- /dev/null 2015-08-12 14:09:54.000000000 +0200 +++ new/src/share/vm/opto/safepointTable.hpp 2015-08-12 14:09:53.000000000 +0200 @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef SHARE_VM_OPTO_SAFEPOINTTABLE_HPP +#define SHARE_VM_OPTO_SAFEPOINTTABLE_HPP + +#if defined(AMD64) +#include "c2_safepointTable_x86_64.hpp" +#else +class ThreadLocalSafepointTable { +public: + int stub_size() { return 0; } + void emit(CodeBuffer &cb, bool has_wide_vectors) {} +}; +#endif + +#endif /* SHARE_VM_OPTO_SAFEPOINTTABLE_HPP */ --- /dev/null 2015-08-12 14:09:55.000000000 +0200 +++ new/src/share/vm/runtime/globalSynchronizer.cpp 2015-08-12 14:09:54.000000000 +0200 @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "runtime/atomic.hpp" +#include "runtime/globalSynchronizer.hpp" +#include "runtime/thread.inline.hpp" + +#ifndef MIN +#define MIN(x, y) (((x) < (y)) ? 
(x) : (y)) +#endif + +volatile int GlobalSynchronizer::_global_serialized_memory_version = 0; +volatile int GlobalSynchronizer::_latest_global_serialized_memory_version = 0; + +int GlobalSynchronizer::global_serialized_memory_version() { + return _global_serialized_memory_version; +} + +GlobalSynchronizer::GlobalSynchronizer(UrgencyLevel start_urgency, UrgencyLevel max_urgency) + : _current_urgency(start_urgency), _max_urgency(max_urgency) { + assert(max_urgency >= start_urgency, "sanity"); + assert(start_urgency >= UrgencyLevel1 && start_urgency <= UrgencyLevelMax, "sanity"); +} + +GlobalSynchronizer::~GlobalSynchronizer() { +} + +void GlobalSynchronizer::start_synchronizing() { + assert(ThreadLocalSafepoints, "sanity"); + _local_serialized_memory_version = Atomic::add(1, &_global_serialized_memory_version); +} + +bool GlobalSynchronizer::increase_urgency() { + if (_current_urgency + 1 < _max_urgency) { + _current_urgency = (UrgencyLevel)(int(_current_urgency) + 1); + return true; + } else { + return false; + } +} + +void GlobalSynchronizer::maximize_urgency() { + _current_urgency = _max_urgency; +} + +void GlobalSynchronizer::threads_do(ThreadClosure *cl) { + Threads::java_threads_do_fast(cl, Thread::current()); +} + +class GSHasFinishedThreadClosure : public ThreadClosure { +private: + int _needed_version; + int _min_agreed_version; + bool _check_thread_state; + +public: + GSHasFinishedThreadClosure(int version, bool check_thread_state) : _needed_version(version), _min_agreed_version(INT_MAX), _check_thread_state(check_thread_state) {} + + virtual void do_thread(Thread *thread) { + JavaThread *jthread = reinterpret_cast<JavaThread*>(thread); + int thread_version = jthread->serialized_memory_version(); + if (thread_version < _needed_version) { + if (!jthread->is_online_vm()) _min_agreed_version = MIN(_needed_version, _min_agreed_version); + else if (_check_thread_state && !jthread->is_online_os()) _min_agreed_version = MIN(_needed_version, _min_agreed_version); + else _min_agreed_version = MIN(thread_version, _min_agreed_version); + } else { + _min_agreed_version = MIN(thread_version, _min_agreed_version); + } + } + + bool did_synchronize() { return _min_agreed_version >= _needed_version; } + void fixup_global_version() { + int global_version = GlobalSynchronizer::_latest_global_serialized_memory_version; + if (global_version < _min_agreed_version) { + (void) Atomic::cmpxchg(_min_agreed_version, &GlobalSynchronizer::_latest_global_serialized_memory_version, global_version); + } + } +}; + +class GSSetYieldpointThreadClosure : public ThreadClosure { + const int _target_version; + const bool _force_yields; +public: + GSSetYieldpointThreadClosure(bool force_yields, int target_version) : _force_yields(force_yields), _target_version(target_version) {} + + virtual void do_thread(Thread *thread) { + JavaThread *const jthread = (JavaThread*)thread; + if (jthread->serialized_memory_version() >= _target_version) return; + if (_force_yields) jthread->set_force_yield(); + jthread->set_yieldpoint(true); + } +}; + +bool GlobalSynchronizer::try_synchronize() { + Thread *thread = Thread::current(); + if (thread->is_Java_thread()) { + JavaThread *jthread = reinterpret_cast<JavaThread*>(thread); + jthread->update_serialized_memory_version(); + } + + if (_latest_global_serialized_memory_version >= _local_serialized_memory_version) { + return true; + } + + GSHasFinishedThreadClosure cl(_local_serialized_memory_version, /* check_thread_state */ _current_urgency >= UrgencyLevel2); + threads_do(&cl); + if (cl.did_synchronize()) { +
cl.fixup_global_version(); + return true; + } + + switch (_current_urgency) { + case UrgencyLevel3: + case UrgencyLevel4: { + GSSetYieldpointThreadClosure cl(false, _local_serialized_memory_version); + threads_do(&cl); + return false; + } + default: return false; + } +} + +void GlobalSynchronizer::synchronize() { + while (!try_synchronize()) os::naked_yield(); +} --- /dev/null 2015-08-12 14:09:56.000000000 +0200 +++ new/src/share/vm/runtime/globalSynchronizer.hpp 2015-08-12 14:09:55.000000000 +0200 @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + + +#ifndef SHARE_VM_RUNTIME_GLOBALSYNCHRONIZER_HPP +#define SHARE_VM_RUNTIME_GLOBALSYNCHRONIZER_HPP + +#include "memory/allocation.hpp" + +class JavaThread; +class ThreadClosure; + +// This class is used to coordinate global synchronization among mutator threads. +// It may be used in a lazy asynchronous way to reduce global overheads of the mechanism. + +class GlobalSynchronizer { + friend class GSHasFinishedThreadClosure; +public: + enum UrgencyLevel { + UrgencyLevel1, + // Hope for runtime to respond willingly + + UrgencyLevel2, + // Arm thread-local yieldpoints for forced handshaking + + UrgencyLevel3, + // Force running threads to yield to complete the handshake. 
+ // May also check if threads are ONPROC when this information is available + + UrgencyLevel4, + // Enforce global synchronization to finish with whatever means necessary + // and available on the platform, including IPI + + UrgencyLevelMax = UrgencyLevel4 + }; + +private: + UrgencyLevel _current_urgency; + UrgencyLevel _max_urgency; + int _threads_left; + int _local_serialized_memory_version; + +private: + void threads_do(ThreadClosure *cl); + +private: + static volatile int _global_serialized_memory_version; + static volatile int _latest_global_serialized_memory_version; + +public: + static int global_serialized_memory_version(); + +public: + virtual ~GlobalSynchronizer(); + GlobalSynchronizer(UrgencyLevel start_urgency = UrgencyLevel1, UrgencyLevel max_urgency = UrgencyLevel4); + + // Starts the synchronization process + void start_synchronizing(); + + bool increase_urgency(); + void maximize_urgency(); + + bool try_synchronize(); // For less urgent more scalable synchronization + void synchronize(); // For aggressive blocking synchronization +}; + +template +class SynchronizerObj: public GlobalSynchronizer, public CHeapObj { +public: + SynchronizerObj(UrgencyLevel start_urgency = UrgencyLevel1, UrgencyLevel max_urgency = UrgencyLevel4) + : GlobalSynchronizer(start_urgency, max_urgency) {} +}; + +#endif // SHARE_VM_RUNTIME_GLOBALSYNCHRONIZER_HPP
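For illustration, here is a minimal sketch of how a VM-internal client could drive the GlobalSynchronizer declared above to get the effect of a global StoreLoad fence via a handshake instead of fencing in every mutator. The caller name elide_membar_handshake and its escalation policy are hypothetical and not part of this patch; only the GlobalSynchronizer and os::naked_yield() calls come from the sources above, and the sketch assumes -XX:+ThreadLocalSafepoints since start_synchronizing() asserts that flag.

// Hypothetical usage sketch -- not part of the patch.
#include "runtime/globalSynchronizer.hpp"
#include "runtime/os.hpp"

static void elide_membar_handshake() {
  // Start politely, but allow escalation all the way to forced yieldpoints.
  GlobalSynchronizer gs(GlobalSynchronizer::UrgencyLevel1,
                        GlobalSynchronizer::UrgencyLevel4);
  gs.start_synchronizing();          // publish a new serialized memory version

  int attempts = 0;
  while (!gs.try_synchronize()) {    // scalable, non-blocking polling
    if (++attempts % 1000 == 0) {
      gs.increase_urgency();         // eventually arms thread-local yieldpoints
    }
    os::naked_yield();
  }
  // Every JavaThread has now observed the new version (or is offline in the
  // VM/OS), which is what allows G1ElideMembar to drop the StoreLoad membar
  // from the G1 post-barrier.
}

The blocking GlobalSynchronizer::synchronize() added by the patch is essentially this loop without the urgency escalation.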