# HG changeset patch
# Parent  cb2d18ff1bbf74a2a58925f8ad3af2713a7bca7e

diff --git a/src/hotspot/share/gc/shenandoah/shenandoahHeap.cpp b/src/hotspot/share/gc/shenandoah/shenandoahHeap.cpp
--- a/src/hotspot/share/gc/shenandoah/shenandoahHeap.cpp
+++ b/src/hotspot/share/gc/shenandoah/shenandoahHeap.cpp
@@ -355,6 +355,7 @@
   _alloc_seq_at_last_gc_start(0),
   _alloc_seq_at_last_gc_end(0),
   _safepoint_workers(NULL),
+  _stalled_threads(0),
 #ifdef ASSERT
   _heap_expansion_count(0),
 #endif
@@ -1586,15 +1587,68 @@
 
   concurrent_thread()->try_set_full_gc();
   cancel_concgc(_oom_evacuation);
 
-  if ((! Thread::current()->is_GC_task_thread()) && (! Thread::current()->is_ConcurrentGC_thread())) {
-    assert(! Threads_lock->owned_by_self()
-           || SafepointSynchronize::is_at_safepoint(), "must not hold Threads_lock here");
-    log_warning(gc)("OOM during evacuation. Let Java thread wait until evacuation finishes.");
-    while (is_evacuation_in_progress()) { // wait.
-      Thread::current()->_ParkEvent->park(1);
+  if (Thread::current()->is_Java_thread()) {
+    switch (JavaThread::current()->thread_state()) {
+      case _thread_in_Java: {
+        /*
+         This method is called from WB leaf call, and handles evacuation failure
+         during WB execution. Since this is a leaf call, we cannot actually
+         block on any interesting mutex. Instead, we are left actively waiting
+         for transition to failing STW operation to begin.
+
+         In doing so, we have to make sure that we don't leave until we can
+         guarantee the to-space invariant holds. In the basic case, WB failure would
+         dereference via fwdptr after getting from here, in case some other
+         thread had managed to evacuate the same object. This is possible
+         because other threads might still have space in their GCLABs.
+
+         In the more advanced case, there is a race due to non-atomic
+         evac_in_progress transition. Consider thread A is stuck here waiting
+         for the evacuation to be over. Control thread drops
+         evacuation_in_progress preparing for Full GC. Thread B misses that
+         update, and successfully evacuates the object, does the write to
+         to-copy. But, before Thread B is able to install the fwdptr, thread A
+         discovers evac_in_progress is down, exits from here, reads the fwdptr,
+         discovers old from-copy, and stores there. Thread B then wakes up and
+         installs to-copy. This breaks the to-space invariant, and silently
+         corrupts the heap: we accepted two writes to separate copies of the
+         object.
+
+         The way out is to wait for all threads to park at the safepoint for
+         Full GC, before returning from here. The intent is that all threads
+         that cannot proceed with evacuation would end up here, and we will
+         transition to Full GC when that happens. This should guarantee no other
+         thread is able to evac the object in question under our feet, without
+         this thread ever knowing about it.
+        */
+        assert(! Threads_lock->owned_by_self()
+               || SafepointSynchronize::is_at_safepoint(), "must not hold Threads_lock here");
+        log_warning(gc)("OOM during evacuation. Let Java thread wait until evacuation finishes.");
+        increase_stalled_threads();
+
+        // Wait until all GC workers have settled down.
+        while (is_evacuation_in_progress()) { // wait.
+          Thread::current()->_ParkEvent->park(1);
+        }
+
+        // Wait until all Java threads that are not stalled in oom_during_evacuation()
+        // have arrived at the safepoint.
+        while (stalled_threads() != SafepointSynchronize::still_running()) { // wait.
+          Thread::current()->_ParkEvent->park(1);
+        }
+        decrease_stalled_threads();
+        break;
+      }
+      case _thread_in_vm: {
+        // Immediately safepoint.
+        ThreadBlockInVM block(JavaThread::current());
+        break;
+      }
+      default:
+        JavaThread::current()->print_thread_state_on(tty); ShouldNotReachHere();
     }
   }
 }
 
 HeapWord* ShenandoahHeap::tlab_post_allocation_setup(HeapWord* obj) {
diff --git a/src/hotspot/share/gc/shenandoah/shenandoahHeap.hpp b/src/hotspot/share/gc/shenandoah/shenandoahHeap.hpp
--- a/src/hotspot/share/gc/shenandoah/shenandoahHeap.hpp
+++ b/src/hotspot/share/gc/shenandoah/shenandoahHeap.hpp
@@ -198,6 +198,8 @@
 
   ShenandoahSharedEnumFlag _cancelled_concgc;
 
+  volatile int _stalled_threads;
+
   ReferenceProcessor* _ref_processor;
 
   ShenandoahForwardedIsAliveClosure _forwarded_is_alive;
@@ -337,6 +339,16 @@
   inline bool is_concurrent_partial_in_progress() const;
   inline bool need_update_refs() const;
 
+  int stalled_threads() { return OrderAccess::load_acquire(&_stalled_threads); }
+  void increase_stalled_threads() {
+    Atomic::inc(&_stalled_threads);
+    OrderAccess::fence();
+  }
+  void decrease_stalled_threads() {
+    Atomic::dec(&_stalled_threads);
+    OrderAccess::fence();
+  }
+
   inline bool region_in_collection_set(size_t region_index) const;
 
   // Mainly there to avoid accidentally calling the templated
diff --git a/src/hotspot/share/runtime/safepoint.cpp b/src/hotspot/share/runtime/safepoint.cpp
--- a/src/hotspot/share/runtime/safepoint.cpp
+++ b/src/hotspot/share/runtime/safepoint.cpp
@@ -73,6 +73,7 @@
 SafepointSynchronize::SynchronizeState volatile SafepointSynchronize::_state = SafepointSynchronize::_not_synchronized;
 volatile int SafepointSynchronize::_waiting_to_block = 0;
+volatile int SafepointSynchronize::_still_running = 0;
 volatile int SafepointSynchronize::_safepoint_counter = 0;
 int SafepointSynchronize::_current_jni_active_count = 0;
 long SafepointSynchronize::_end_of_last_safepoint = 0;
 
@@ -114,6 +115,7 @@
   _waiting_to_block = nof_threads;
   TryingToBlock = 0 ;
   int still_running = nof_threads;
+  OrderAccess::release_store(&_still_running, still_running);
 
   // Save the starting time, so that it can be compared to see if this has taken
   // too long to complete.
@@ -211,6 +213,7 @@
       cur_state->examine_state_of_thread();
       if (!cur_state->is_running()) {
         still_running--;
+        OrderAccess::release_store(&_still_running, still_running);
         // consider adjusting steps downward:
         //   steps = 0
         //   steps -= NNN
@@ -316,6 +319,7 @@
       assert(iterations < (uint)max_jint, "We have been iterating in the safepoint loop too long");
     }
     assert(still_running == 0, "sanity check");
+    assert(SafepointSynchronize::still_running() == 0, "sanity check");
 
     if (PrintSafepointStatistics) {
       update_statistics_on_spin_end();
diff --git a/src/hotspot/share/runtime/safepoint.hpp b/src/hotspot/share/runtime/safepoint.hpp
--- a/src/hotspot/share/runtime/safepoint.hpp
+++ b/src/hotspot/share/runtime/safepoint.hpp
@@ -106,6 +106,7 @@
  private:
   static volatile SynchronizeState _state;     // Threads might read this flag directly, without acquiring the Threads_lock
   static volatile int _waiting_to_block;       // number of threads we are waiting for to block
+  static volatile int _still_running;          // number of threads that are still running
   static int _current_jni_active_count;        // Counts the number of active critical natives during the safepoint
 
   // This counter is used for fast versions of jni_GetField.
@@ -173,6 +174,8 @@
 
   static void block(JavaThread *thread);
   static void signal_thread_at_safepoint() { _waiting_to_block--; }
 
+  static int still_running() { return OrderAccess::load_acquire(&_still_running); }
+
   // Exception handling for page polling
   static void handle_polling_page_exception(JavaThread *thread);