--- old/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp 2014-04-15 11:59:17.922844136 +0200
+++ new/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp 2014-04-15 11:59:17.846844139 +0200
@@ -29,7 +29,7 @@
 #include "gc_implementation/g1/g1HotCardCache.hpp"
 #include "runtime/java.hpp"
 
-ConcurrentG1Refine::ConcurrentG1Refine(G1CollectedHeap* g1h) :
+ConcurrentG1Refine::ConcurrentG1Refine(G1CollectedHeap* g1h, CardTableEntryClosure* refine_closure) :
   _threads(NULL), _n_threads(0),
   _hot_card_cache(g1h)
 {
@@ -61,7 +61,7 @@
   ConcurrentG1RefineThread *next = NULL;
   for (uint i = _n_threads - 1; i != UINT_MAX; i--) {
-    ConcurrentG1RefineThread* t = new ConcurrentG1RefineThread(this, next, worker_id_offset, i);
+    ConcurrentG1RefineThread* t = new ConcurrentG1RefineThread(this, next, refine_closure, worker_id_offset, i);
     assert(t != NULL, "Conc refine should have been created");
     if (t->osthread() == NULL) {
       vm_shutdown_during_initialization("Could not create ConcurrentG1RefineThread");
--- old/src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp 2014-04-15 11:59:18.362844123 +0200
+++ new/src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp 2014-04-15 11:59:18.290844125 +0200
@@ -71,7 +71,7 @@
   void reset_threshold_step();
 
  public:
-  ConcurrentG1Refine(G1CollectedHeap* g1h);
+  ConcurrentG1Refine(G1CollectedHeap* g1h, CardTableEntryClosure* refine_closure);
   ~ConcurrentG1Refine();
 
   void init(); // Accomplish some initialization that has to wait.
--- old/src/share/vm/gc_implementation/g1/concurrentG1RefineThread.cpp 2014-04-15 11:59:18.782844109 +0200
+++ new/src/share/vm/gc_implementation/g1/concurrentG1RefineThread.cpp 2014-04-15 11:59:18.706844112 +0200
@@ -33,8 +33,10 @@
 ConcurrentG1RefineThread::
 ConcurrentG1RefineThread(ConcurrentG1Refine* cg1r, ConcurrentG1RefineThread *next,
+                         CardTableEntryClosure* refine_closure,
                          uint worker_id_offset, uint worker_id) :
   ConcurrentGCThread(),
+  _refine_closure(refine_closure),
   _worker_id_offset(worker_id_offset),
   _worker_id(worker_id),
   _active(false),
@@ -204,7 +206,7 @@
       if (_next != NULL && !_next->is_active() && curr_buffer_num > _next->_threshold) {
         _next->activate();
       }
-    } while (dcqs.apply_closure_to_completed_buffer(_worker_id + _worker_id_offset, cg1r()->green_zone()));
+    } while (dcqs.apply_closure_to_completed_buffer(_refine_closure, _worker_id + _worker_id_offset, cg1r()->green_zone()));
 
     // We can exit the loop above while being active if there was a yield request.
     if (is_active()) {
--- old/src/share/vm/gc_implementation/g1/concurrentG1RefineThread.hpp 2014-04-15 11:59:19.306844093 +0200
+++ new/src/share/vm/gc_implementation/g1/concurrentG1RefineThread.hpp 2014-04-15 11:59:19.230844095 +0200
@@ -28,6 +28,7 @@
 #include "gc_implementation/shared/concurrentGCThread.hpp"
 
 // Forward Decl.
+class CardTableEntryClosure;
 class ConcurrentG1Refine;
 
 // The G1 Concurrent Refinement Thread (could be several in the future).
@@ -49,6 +50,9 @@
   Monitor* _monitor;
   ConcurrentG1Refine* _cg1r;
 
+  // The closure applied to completed log buffers.
+  CardTableEntryClosure* _refine_closure;
+
   int _thread_threshold_step;
   // This thread activation threshold
   int _threshold;
@@ -68,6 +72,7 @@
   virtual void run();
   // Constructor
   ConcurrentG1RefineThread(ConcurrentG1Refine* cg1r, ConcurrentG1RefineThread* next,
+                           CardTableEntryClosure* refine_closure,
                            uint worker_id_offset, uint worker_id);
 
   void initialize();
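
Note: the refinement threads above no longer depend on a closure registered globally with the DirtyCardQueueSet; each thread now receives its closure at construction and applies it explicitly. A minimal sketch of this injection pattern (simplified stand-ins, not the HotSpot classes):

    class CardTableEntryClosure;  // applied to each logged card

    class RefineThread {
      CardTableEntryClosure* const _refine_closure;  // set once, never reassigned
     public:
      explicit RefineThread(CardTableEntryClosure* refine_closure)
        : _refine_closure(refine_closure) {}
      // run() passes _refine_closure to every buffer this thread drains,
      // so the queue set itself carries no mutable closure state.
    };

Because each thread's closure reference is immutable after construction, no synchronization is needed when several refinement threads run concurrently.
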
--- old/src/share/vm/gc_implementation/g1/dirtyCardQueue.cpp 2014-04-15 11:59:19.842844076 +0200
+++ new/src/share/vm/gc_implementation/g1/dirtyCardQueue.cpp 2014-04-15 11:59:19.738844079 +0200
@@ -70,7 +70,7 @@
 DirtyCardQueueSet::DirtyCardQueueSet(bool notify_when_complete) :
   PtrQueueSet(notify_when_complete),
-  _closure(NULL),
+  _mut_process_closure(NULL),
   _shared_dirty_card_queue(this, true /*perm*/),
   _free_ids(NULL),
   _processed_buffers_mut(0), _processed_buffers_rs_thread(0)
@@ -83,10 +83,11 @@
   return (uint)os::processor_count();
 }
 
-void DirtyCardQueueSet::initialize(Monitor* cbl_mon, Mutex* fl_lock,
+void DirtyCardQueueSet::initialize(CardTableEntryClosure* cl, Monitor* cbl_mon, Mutex* fl_lock,
                                    int process_completed_threshold,
                                    int max_completed_queue,
                                    Mutex* lock, PtrQueueSet* fl_owner) {
+  _mut_process_closure = cl;
   PtrQueueSet::initialize(cbl_mon, fl_lock, process_completed_threshold,
                           max_completed_queue, fl_owner);
   set_buffer_size(G1UpdateBufferSize);
@@ -98,18 +99,15 @@
   t->dirty_card_queue().handle_zero_index();
 }
 
-void DirtyCardQueueSet::set_closure(CardTableEntryClosure* closure) {
-  _closure = closure;
-}
-
-void DirtyCardQueueSet::iterate_closure_all_threads(bool consume,
+void DirtyCardQueueSet::iterate_closure_all_threads(CardTableEntryClosure* cl,
+                                                    bool consume,
                                                     uint worker_i) {
   assert(SafepointSynchronize::is_at_safepoint(), "Must be at safepoint.");
   for(JavaThread* t = Threads::first(); t; t = t->next()) {
-    bool b = t->dirty_card_queue().apply_closure(_closure, consume);
+    bool b = t->dirty_card_queue().apply_closure(cl, consume);
     guarantee(b, "Should not be interrupted.");
   }
-  bool b = shared_dirty_card_queue()->apply_closure(_closure,
+  bool b = shared_dirty_card_queue()->apply_closure(cl,
                                                     consume,
                                                     worker_i);
   guarantee(b, "Should not be interrupted.");
@@ -143,7 +141,7 @@
   bool b = false;
   if (worker_i != UINT_MAX) {
-    b = DirtyCardQueue::apply_closure_to_buffer(_closure, buf, 0,
+    b = DirtyCardQueue::apply_closure_to_buffer(_mut_process_closure, buf, 0,
                                                 _sz, true, worker_i);
     if (b) Atomic::inc(&_processed_buffers_mut);
@@ -218,18 +216,11 @@
   return res;
 }
 
-bool DirtyCardQueueSet::apply_closure_to_completed_buffer(uint worker_i,
-                                                          int stop_at,
-                                                          bool during_pause) {
-  return apply_closure_to_completed_buffer(_closure, worker_i,
-                                           stop_at, during_pause);
-}
-
-void DirtyCardQueueSet::apply_closure_to_all_completed_buffers() {
+void DirtyCardQueueSet::apply_closure_to_all_completed_buffers(CardTableEntryClosure* cl) {
   BufferNode* nd = _completed_buffers_head;
   while (nd != NULL) {
     bool b =
-      DirtyCardQueue::apply_closure_to_buffer(_closure,
+      DirtyCardQueue::apply_closure_to_buffer(cl,
                                               BufferNode::make_buffer_from_node(nd),
                                               0, _sz, false);
     guarantee(b, "Should not stop early.");
@@ -237,6 +228,24 @@
   }
 }
 
+void DirtyCardQueueSet::par_apply_closure_to_all_completed_buffers(CardTableEntryClosure* cl) {
+  BufferNode* nd = _cur_par_buffer_node;
+  while (nd != NULL) {
+    BufferNode* next = (BufferNode*)nd->next();
+    BufferNode* actual = (BufferNode*)Atomic::cmpxchg_ptr((void*)next, (volatile void*)&_cur_par_buffer_node, (void*)nd);
+    if (actual == nd) {
+      bool b =
+        DirtyCardQueue::apply_closure_to_buffer(cl,
                                                 BufferNode::make_buffer_from_node(actual),
+                                                0, _sz, false);
+      guarantee(b, "Should not stop early.");
+      nd = next;
+    } else {
+      nd = actual;
+    }
+  }
+}
+
 // Deallocates any completed log buffers
 void DirtyCardQueueSet::clear() {
   BufferNode* buffers_to_delete = NULL;
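
Note: par_apply_closure_to_all_completed_buffers() lets several workers walk one linked list of completed buffers without locking. Each worker reads the shared cursor, then tries to CAS it from the node it read to that node's successor; the winner owns the node, and a loser simply resumes from the value the CAS returned. A self-contained C++11 sketch of the same claiming protocol (std::atomic stands in for Atomic::cmpxchg_ptr; names are illustrative, and in practice drain() would run on several threads at once):

    #include <atomic>
    #include <cstdio>

    struct Node {
      int payload;
      Node* next;
    };

    static std::atomic<Node*> cursor;  // shared cursor, seeded before the parallel phase

    static void drain(void (*process)(Node*)) {
      Node* nd = cursor.load();
      while (nd != nullptr) {
        Node* next = nd->next;
        Node* expected = nd;
        // Claim nd by advancing the shared cursor past it.
        if (cursor.compare_exchange_strong(expected, next)) {
          process(nd);    // this worker now owns nd exclusively
          nd = next;      // continue from nd's successor
        } else {
          nd = expected;  // lost the race; resume from the winner's position
        }
      }
    }

    int main() {
      Node c = {3, nullptr}, b = {2, &c}, a = {1, &b};
      cursor.store(&a);  // analogue of reset_for_par_iteration()
      drain([](Node* n) { std::printf("claimed %d\n", n->payload); });
      return 0;
    }

Every CAS either claims a node or teaches the loser the current cursor position, so each buffer is processed exactly once and the phase as a whole is lock-free.
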
--- old/src/share/vm/gc_implementation/g1/dirtyCardQueue.hpp 2014-04-15 11:59:20.382844059 +0200
+++ new/src/share/vm/gc_implementation/g1/dirtyCardQueue.hpp 2014-04-15 11:59:20.302844062 +0200
@@ -73,7 +73,8 @@
 class DirtyCardQueueSet: public PtrQueueSet {
-  CardTableEntryClosure* _closure;
+  // The closure used in mut_process_buffer().
+  CardTableEntryClosure* _mut_process_closure;
 
   DirtyCardQueue _shared_dirty_card_queue;
 
@@ -88,10 +89,12 @@
   jint _processed_buffers_mut;
   jint _processed_buffers_rs_thread;
 
+  // Current buffer node used for parallel iteration.
+  BufferNode* volatile _cur_par_buffer_node;
 public:
   DirtyCardQueueSet(bool notify_when_complete = true);
 
-  void initialize(Monitor* cbl_mon, Mutex* fl_lock,
+  void initialize(CardTableEntryClosure* cl, Monitor* cbl_mon, Mutex* fl_lock,
                   int process_completed_threshold,
                   int max_completed_queue,
                   Mutex* lock, PtrQueueSet* fl_owner = NULL);
@@ -102,33 +105,15 @@
 
   static void handle_zero_index_for_thread(JavaThread* t);
 
-  // Register "blk" as "the closure" for all queues. Only one such closure
-  // is allowed. The "apply_closure_to_completed_buffer" method will apply
-  // this closure to a completed buffer, and "iterate_closure_all_threads"
-  // applies it to partially-filled buffers (the latter should only be done
-  // with the world stopped).
-  void set_closure(CardTableEntryClosure* closure);
-
-  // If there is a registered closure for buffers, apply it to all entries
-  // in all currently-active buffers. This should only be applied at a
-  // safepoint. (Currently must not be called in parallel; this should
-  // change in the future.) If "consume" is true, processed entries are
-  // discarded.
-  void iterate_closure_all_threads(bool consume = true,
+  // Apply the given closure to all entries in all currently-active buffers.
+  // This should only be applied at a safepoint. (Currently must not be called
+  // in parallel; this should change in the future.) If "consume" is true,
+  // processed entries are discarded.
+  void iterate_closure_all_threads(CardTableEntryClosure* cl,
+                                   bool consume = true,
                                    uint worker_i = 0);
 
   // If there exists some completed buffer, pop it, then apply the
-  // registered closure to all its elements, nulling out those elements
-  // processed. If all elements are processed, returns "true". If no
-  // completed buffers exist, returns false. If a completed buffer exists,
-  // but is only partially completed before a "yield" happens, the
-  // partially completed buffer (with its processed elements set to NULL)
-  // is returned to the completed buffer set, and this call returns false.
-  bool apply_closure_to_completed_buffer(uint worker_i = 0,
-                                         int stop_at = 0,
-                                         bool during_pause = false);
-
-  // If there exists some completed buffer, pop it, then apply the
   // specified closure to all its elements, nulling out those elements
   // processed. If all elements are processed, returns "true". If no
   // completed buffers exist, returns false. If a completed buffer exists,
@@ -149,7 +134,12 @@
 
   // Applies the current closure to all completed buffers,
   // non-consumptively.
-  void apply_closure_to_all_completed_buffers();
+  void apply_closure_to_all_completed_buffers(CardTableEntryClosure* cl);
+
+  void reset_for_par_iteration() { _cur_par_buffer_node = _completed_buffers_head; }
+  // Applies the given closure to all completed buffers, non-consumptively.
+  // Parallel version.
+  void par_apply_closure_to_all_completed_buffers(CardTableEntryClosure* cl);
 
   DirtyCardQueue* shared_dirty_card_queue() {
     return &_shared_dirty_card_queue;
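
Note the intended call sequence for the new parallel iteration: reset_for_par_iteration() seeds _cur_par_buffer_node from _completed_buffers_head exactly once, single-threaded, and only then may the workers iterate. Sketched from redirty_logged_cards() further below:

    dcqs.reset_for_par_iteration();               // once, before spawning workers
    // ... then, in each worker:
    RedirtyLoggedCardTableEntryFastClosure cl;    // one closure instance per worker
    dcqs.par_apply_closure_to_all_completed_buffers(&cl);

Giving each worker its own closure instance keeps its _num_processed counter thread-local, so no atomic increments are needed on the per-card path.
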
--- old/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp 2014-04-15 11:59:20.942844042 +0200
+++ new/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp 2014-04-15 11:59:20.838844045 +0200
@@ -92,16 +92,12 @@
 // Local to this file.
 class RefineCardTableEntryClosure: public CardTableEntryClosure {
-  G1RemSet* _g1rs;
-  ConcurrentG1Refine* _cg1r;
   bool _concurrent;
 public:
-  RefineCardTableEntryClosure(G1RemSet* g1rs,
-                              ConcurrentG1Refine* cg1r) :
-    _g1rs(g1rs), _cg1r(cg1r), _concurrent(true)
-  {}
+  RefineCardTableEntryClosure() : _concurrent(true) { }
+
   bool do_card_ptr(jbyte* card_ptr, uint worker_i) {
-    bool oops_into_cset = _g1rs->refine_card(card_ptr, worker_i, false);
+    bool oops_into_cset = G1CollectedHeap::heap()->g1_rem_set()->refine_card(card_ptr, worker_i, false);
     // This path is executed by the concurrent refine or mutator threads,
     // concurrently, and so we do not care if card_ptr contains references
     // that point into the collection set.
@@ -114,6 +110,7 @@
     // Otherwise, we finished successfully; return true.
     return true;
   }
+
   void set_concurrent(bool b) { _concurrent = b; }
 };
@@ -477,9 +474,8 @@
   // First clear the logged cards.
   ClearLoggedCardTableEntryClosure clear;
-  dcqs.set_closure(&clear);
-  dcqs.apply_closure_to_all_completed_buffers();
-  dcqs.iterate_closure_all_threads(false);
+  dcqs.apply_closure_to_all_completed_buffers(&clear);
+  dcqs.iterate_closure_all_threads(&clear, false);
   clear.print_histo();
 
   // Now ensure that there's no dirty cards.
@@ -492,9 +488,8 @@
   guarantee(count2.n() == 0, "Card table should be clean.");
 
   RedirtyLoggedCardTableEntryClosure redirty;
-  JavaThread::dirty_card_queue_set().set_closure(&redirty);
-  dcqs.apply_closure_to_all_completed_buffers();
-  dcqs.iterate_closure_all_threads(false);
+  dcqs.apply_closure_to_all_completed_buffers(&redirty);
+  dcqs.iterate_closure_all_threads(&redirty, false);
   gclog_or_tty->print_cr("Log entries = %d, dirty cards = %d.",
                          clear.calls(), orig_count);
   guarantee(redirty.calls() == clear.calls(),
@@ -507,8 +502,6 @@
               orig_count, count3.n());
     guarantee(count3.n() >= orig_count, "Should have restored them all.");
   }
-
-  JavaThread::dirty_card_queue_set().set_closure(_refine_cte_cl);
 }
@@ -2002,7 +1995,9 @@
   Universe::check_alignment(max_byte_size, HeapRegion::GrainBytes, "g1 heap");
   Universe::check_alignment(max_byte_size, heap_alignment, "g1 heap");
 
-  _cg1r = new ConcurrentG1Refine(this);
+  _refine_cte_cl = new RefineCardTableEntryClosure();
+
+  _cg1r = new ConcurrentG1Refine(this, _refine_cte_cl);
 
   // Reserve the maximum.
@@ -2097,24 +2092,21 @@
   // Perform any initialization actions delegated to the policy.
   g1_policy()->init();
 
-  _refine_cte_cl =
-    new RefineCardTableEntryClosure(g1_rem_set(),
-                                    concurrent_g1_refine());
-  JavaThread::dirty_card_queue_set().set_closure(_refine_cte_cl);
-
   JavaThread::satb_mark_queue_set().initialize(SATB_Q_CBL_mon,
                                                SATB_Q_FL_lock,
                                                G1SATBProcessCompletedThreshold,
                                                Shared_SATB_Q_lock);
 
-  JavaThread::dirty_card_queue_set().initialize(DirtyCardQ_CBL_mon,
+  JavaThread::dirty_card_queue_set().initialize(_refine_cte_cl,
+                                                DirtyCardQ_CBL_mon,
                                                 DirtyCardQ_FL_lock,
                                                 concurrent_g1_refine()->yellow_zone(),
                                                 concurrent_g1_refine()->red_zone(),
                                                 Shared_DirtyCardQ_lock);
 
   if (G1DeferredRSUpdate) {
-    dirty_card_queue_set().initialize(DirtyCardQ_CBL_mon,
+    dirty_card_queue_set().initialize(NULL, // Should never be called by the Java code
+                                      DirtyCardQ_CBL_mon,
                                       DirtyCardQ_FL_lock,
                                       -1, // never trigger processing
                                       -1, // no limit on length
@@ -2124,7 +2116,8 @@
   // Initialize the card queue set used to hold cards containing
   // references into the collection set.
-  _into_cset_dirty_card_queue_set.initialize(DirtyCardQ_CBL_mon,
+  _into_cset_dirty_card_queue_set.initialize(NULL, // Should never be called by the Java code
+                                             DirtyCardQ_CBL_mon,
                                              DirtyCardQ_FL_lock,
                                              -1, // never trigger processing
                                              -1, // no limit on length
@@ -5262,20 +5255,59 @@
 }
 
 class RedirtyLoggedCardTableEntryFastClosure : public CardTableEntryClosure {
-public:
+ private:
+  size_t _num_processed;
+
+ public:
+  RedirtyLoggedCardTableEntryFastClosure() : CardTableEntryClosure(), _num_processed(0) { }
+
   bool do_card_ptr(jbyte* card_ptr, uint worker_i) {
     *card_ptr = CardTableModRefBS::dirty_card_val();
+    _num_processed++;
     return true;
   }
+
+  size_t num_processed() const { return _num_processed; }
+};
+
+class G1RedirtyLoggedCardsTask : public AbstractGangTask {
+ private:
+  DirtyCardQueueSet* _queue;
+ public:
+  G1RedirtyLoggedCardsTask(DirtyCardQueueSet* queue) : AbstractGangTask("Redirty Cards"), _queue(queue) { }
+
+  virtual void work(uint worker_id) {
+    double start_time = os::elapsedTime();
+
+    RedirtyLoggedCardTableEntryFastClosure cl;
+    if (G1CollectedHeap::heap()->use_parallel_gc_threads()) {
+      _queue->par_apply_closure_to_all_completed_buffers(&cl);
+    } else {
+      _queue->apply_closure_to_all_completed_buffers(&cl);
+    }
+
+    G1GCPhaseTimes* timer = G1CollectedHeap::heap()->g1_policy()->phase_times();
+    timer->record_redirty_logged_cards_time_ms(worker_id, (os::elapsedTime() - start_time) * 1000.0);
+    timer->record_redirty_logged_cards_processed_cards(worker_id, cl.num_processed());
+  }
 };
 
 void G1CollectedHeap::redirty_logged_cards() {
   guarantee(G1DeferredRSUpdate, "Must only be called when using deferred RS updates.");
   double redirty_logged_cards_start = os::elapsedTime();
 
-  RedirtyLoggedCardTableEntryFastClosure redirty;
-  dirty_card_queue_set().set_closure(&redirty);
-  dirty_card_queue_set().apply_closure_to_all_completed_buffers();
+  uint n_workers = (G1CollectedHeap::use_parallel_gc_threads() ?
+                    _g1h->workers()->active_workers() : 1);
+
+  G1RedirtyLoggedCardsTask redirty_task(&dirty_card_queue_set());
+  dirty_card_queue_set().reset_for_par_iteration();
+  if (use_parallel_gc_threads()) {
+    set_par_threads(n_workers);
+    workers()->run_task(&redirty_task);
+    set_par_threads(0);
+  } else {
+    redirty_task.work(0);
+  }
 
   DirtyCardQueueSet& dcq = JavaThread::dirty_card_queue_set();
   dcq.merge_bufferlists(&dirty_card_queue_set());
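
Note: G1RedirtyLoggedCardsTask records, per worker, both the elapsed time and the number of cards it redirtied, writing into per-worker slots instead of a shared counter. The same pattern in plain C++11 threads (illustrative only; the worker count and array names are assumptions, not HotSpot code):

    #include <chrono>
    #include <cstdio>
    #include <thread>
    #include <vector>

    const unsigned kWorkers = 4;       // assumed worker count for the demo
    static double times_ms[kWorkers];  // one slot per worker, like WorkerDataArray
    static size_t cards[kWorkers];

    static void work(unsigned worker_id) {
      auto start = std::chrono::steady_clock::now();
      size_t num_processed = 0;
      // ... claim buffers and redirty their cards, bumping num_processed ...
      times_ms[worker_id] = std::chrono::duration<double, std::milli>(
          std::chrono::steady_clock::now() - start).count();
      cards[worker_id] = num_processed;  // plain store: each worker owns its slot
    }

    int main() {
      std::vector<std::thread> gang;
      for (unsigned i = 0; i < kWorkers; i++) gang.emplace_back(work, i);
      for (std::thread& t : gang) t.join();
      for (unsigned i = 0; i < kWorkers; i++)
        std::printf("worker %u: %.3f ms, %zu cards\n", i, times_ms[i], cards[i]);
      return 0;
    }

Per-worker slots avoid contention entirely and also preserve the per-thread breakdown that the phase-times changes below print at the finest log level.
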
--- old/src/share/vm/gc_implementation/g1/g1GCPhaseTimes.cpp 2014-04-15 11:59:21.642844020 +0200
+++ new/src/share/vm/gc_implementation/g1/g1GCPhaseTimes.cpp 2014-04-15 11:59:21.550844023 +0200
@@ -170,6 +170,8 @@
   _last_gc_worker_end_times_ms(_max_gc_threads, "%.1lf", false),
   _last_gc_worker_times_ms(_max_gc_threads, "%.1lf"),
   _last_gc_worker_other_times_ms(_max_gc_threads, "%.1lf"),
+  _last_redirty_logged_cards_time_ms(_max_gc_threads, "%.1lf"),
+  _last_redirty_logged_cards_processed_cards(_max_gc_threads, SIZE_FORMAT),
   _cur_string_dedup_queue_fixup_worker_times_ms(_max_gc_threads, "%.1lf"),
   _cur_string_dedup_table_fixup_worker_times_ms(_max_gc_threads, "%.1lf")
 {
@@ -195,6 +197,10 @@
   _last_gc_worker_end_times_ms.reset();
   _last_gc_worker_times_ms.reset();
   _last_gc_worker_other_times_ms.reset();
+
+  _last_redirty_logged_cards_time_ms.reset();
+  _last_redirty_logged_cards_processed_cards.reset();
+
 }
 
 void G1GCPhaseTimes::note_gc_end() {
@@ -230,6 +236,9 @@
   _last_gc_worker_times_ms.verify();
   _last_gc_worker_other_times_ms.verify();
+
+  _last_redirty_logged_cards_time_ms.verify();
+  _last_redirty_logged_cards_processed_cards.verify();
 }
 
 void G1GCPhaseTimes::note_string_dedup_fixup_start() {
@@ -349,6 +358,10 @@
   print_stats(2, "Ref Enq", _cur_ref_enq_time_ms);
   if (G1DeferredRSUpdate) {
     print_stats(2, "Redirty Cards", _recorded_redirty_logged_cards_time_ms);
+    if (G1Log::finest()) {
+      _last_redirty_logged_cards_time_ms.print(3, "Parallel Redirty");
+      _last_redirty_logged_cards_processed_cards.print(3, "Redirtied Cards");
+    }
   }
   print_stats(2, "Free CSet",
     (_recorded_young_free_cset_time_ms +
--- old/src/share/vm/gc_implementation/g1/g1GCPhaseTimes.hpp 2014-04-15 11:59:22.130844004 +0200
+++ new/src/share/vm/gc_implementation/g1/g1GCPhaseTimes.hpp 2014-04-15 11:59:22.042844007 +0200
@@ -151,6 +151,8 @@
   double _recorded_young_cset_choice_time_ms;
   double _recorded_non_young_cset_choice_time_ms;
 
+  WorkerDataArray<double> _last_redirty_logged_cards_time_ms;
+  WorkerDataArray<size_t> _last_redirty_logged_cards_processed_cards;
   double _recorded_redirty_logged_cards_time_ms;
 
   double _recorded_young_free_cset_time_ms;
@@ -293,6 +295,14 @@
     _recorded_non_young_cset_choice_time_ms = time_ms;
   }
 
+  void record_redirty_logged_cards_time_ms(uint worker_i, double time_ms) {
+    _last_redirty_logged_cards_time_ms.set(worker_i, time_ms);
+  }
+
+  void record_redirty_logged_cards_processed_cards(uint worker_i, size_t processed_cards) {
+    _last_redirty_logged_cards_processed_cards.set(worker_i, processed_cards);
+  }
+
   void record_redirty_logged_cards_time_ms(double time_ms) {
     _recorded_redirty_logged_cards_time_ms = time_ms;
   }
--- old/test/gc/g1/TestGCLogMessages.java 2014-04-15 11:59:22.734843985 +0200
+++ new/test/gc/g1/TestGCLogMessages.java 2014-04-15 11:59:22.634843989 +0200
@@ -23,7 +23,7 @@
 
 /*
  * @test TestPrintGCDetails
- * @bug 8035406 8027295 8035398
+ * @bug 8035406 8027295 8035398 8019342
  * @summary Ensure that the PrintGCDetails output for a minor GC with G1
  * includes the expected necessary messages.
 * @key gc
@@ -48,6 +48,8 @@
     OutputAnalyzer output = new OutputAnalyzer(pb.start());
 
     output.shouldNotContain("[Redirty Cards");
+    output.shouldNotContain("[Parallel Redirty");
+    output.shouldNotContain("[Redirtied Cards");
     output.shouldNotContain("[Code Root Purge");
     output.shouldNotContain("[String Dedup Fixup");
     output.shouldNotContain("[Young Free CSet");
@@ -63,6 +65,8 @@
     output = new OutputAnalyzer(pb.start());
 
     output.shouldContain("[Redirty Cards");
+    output.shouldNotContain("[Parallel Redirty");
+    output.shouldNotContain("[Redirtied Cards");
     output.shouldContain("[Code Root Purge");
     output.shouldContain("[String Dedup Fixup");
     output.shouldNotContain("[Young Free CSet");
@@ -80,6 +84,8 @@
     output = new OutputAnalyzer(pb.start());
 
     output.shouldContain("[Redirty Cards");
+    output.shouldContain("[Parallel Redirty");
+    output.shouldContain("[Redirtied Cards");
     output.shouldContain("[Code Root Purge");
     output.shouldContain("[String Dedup Fixup");
     output.shouldContain("[Young Free CSet");