/*
 * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "gc/g1/g1BufferNodeList.hpp"
#include "gc/g1/g1CardTableEntryClosure.hpp"
#include "gc/g1/g1CollectedHeap.inline.hpp"
#include "gc/g1/g1DirtyCardQueue.hpp"
#include "gc/g1/g1EpochSynchronizer.hpp"
#include "gc/g1/g1FreeIdSet.hpp"
#include "gc/g1/g1RedirtyCardsQueue.hpp"
#include "gc/g1/g1RemSet.hpp"
#include "gc/g1/g1ThreadLocalData.hpp"
#include "gc/g1/heapRegionRemSet.hpp"
#include "gc/shared/suspendibleThreadSet.hpp"
#include "gc/shared/workgroup.hpp"
#include "logging/log.hpp"
#include "memory/iterator.hpp"
#include "memory/resourceArea.hpp"
#include "runtime/flags/flagSetting.hpp"
#include "runtime/mutexLocker.hpp"
#include "runtime/orderAccess.hpp"
#include "runtime/os.hpp"
#include "runtime/safepoint.hpp"
#include "runtime/thread.inline.hpp"
#include "runtime/threadSMR.hpp"
#include "runtime/timer.hpp"
#include "utilities/quickSort.hpp"

G1DirtyCardQueue::G1DirtyCardQueue(G1DirtyCardQueueSet* qset) :
  // Dirty card queues are always active, so we create them with their
  // active field set to true.
  PtrQueue(qset, true /* active */)
{ }

G1DirtyCardQueue::~G1DirtyCardQueue() {
  flush();
}

void G1DirtyCardQueue::handle_completed_buffer() {
  assert(_buf != NULL, "precondition");
  BufferNode* node = BufferNode::make_node_from_buffer(_buf, index());
  G1DirtyCardQueueSet* dcqs = dirty_card_qset();
  if (dcqs->process_or_enqueue_completed_buffer(node)) {
    reset();                    // Buffer fully processed, reset index.
  } else {
    allocate_buffer();          // Buffer enqueued, get a new one.
  }
}

// Assumed to be zero by concurrent threads.
static uint par_ids_start() { return 0; }

G1DirtyCardQueueSet::G1DirtyCardQueueSet(Monitor* cbl_mon,
                                         BufferNode::Allocator* allocator) :
  PtrQueueSet(allocator),
  _cbl_mon(cbl_mon),
  _completed_buffers_head(NULL),
  _completed_buffers_tail(NULL),
  _num_cards(0),
  _process_cards_threshold(ProcessCardsThresholdNever),
  _process_completed_buffers(false),
  _max_cards(MaxCardsUnlimited),
  _max_cards_padding(0),
  _free_ids(par_ids_start(), num_par_ids()),
  _mutator_refined_cards_counters(NEW_C_HEAP_ARRAY(size_t, num_par_ids(), mtGC))
{
  ::memset(_mutator_refined_cards_counters, 0, num_par_ids() * sizeof(size_t));
  _all_active = true;
}

G1DirtyCardQueueSet::~G1DirtyCardQueueSet() {
  abandon_completed_buffers();
  FREE_C_HEAP_ARRAY(size_t, _mutator_refined_cards_counters);
}

// Determines how many mutator threads can process the buffers in parallel.
uint G1DirtyCardQueueSet::num_par_ids() {
  return (uint)os::initial_active_processor_count();
}

size_t G1DirtyCardQueueSet::total_mutator_refined_cards() const {
  size_t sum = 0;
  for (uint i = 0; i < num_par_ids(); ++i) {
    sum += _mutator_refined_cards_counters[i];
  }
  return sum;
}

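// Handle a dirty card queue whose index has reached zero, i.e. whose
// buffer (if it has one) is full.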
void G1DirtyCardQueueSet::handle_zero_index_for_thread(Thread* t) {
  G1ThreadLocalData::dirty_card_queue(t).handle_zero_index();
}

void G1DirtyCardQueueSet::enqueue_completed_buffer(BufferNode* cbn) {
  MonitorLocker ml(_cbl_mon, Mutex::_no_safepoint_check_flag);
  cbn->set_next(NULL);
  if (_completed_buffers_tail == NULL) {
    assert(_completed_buffers_head == NULL, "Well-formedness");
    _completed_buffers_head = cbn;
    _completed_buffers_tail = cbn;
  } else {
    _completed_buffers_tail->set_next(cbn);
    _completed_buffers_tail = cbn;
  }
  _num_cards += buffer_size() - cbn->index();

  if (!process_completed_buffers() &&
      (num_cards() > process_cards_threshold())) {
    set_process_completed_buffers(true);
    ml.notify_all();
  }
  verify_num_cards();
}

BufferNode* G1DirtyCardQueueSet::get_completed_buffer(size_t stop_at) {
  MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag);

  if (num_cards() <= stop_at) {
    return NULL;
  }

  assert(num_cards() > 0, "invariant");
  assert(_completed_buffers_head != NULL, "invariant");
  assert(_completed_buffers_tail != NULL, "invariant");

  BufferNode* bn = _completed_buffers_head;
  _num_cards -= buffer_size() - bn->index();
  _completed_buffers_head = bn->next();
  if (_completed_buffers_head == NULL) {
    assert(num_cards() == 0, "invariant");
    _completed_buffers_tail = NULL;
    set_process_completed_buffers(false);
  }
  verify_num_cards();
  bn->set_next(NULL);
  return bn;
}

#ifdef ASSERT
void G1DirtyCardQueueSet::verify_num_cards() const {
  size_t actual = 0;
  BufferNode* cur = _completed_buffers_head;
  while (cur != NULL) {
    actual += buffer_size() - cur->index();
    cur = cur->next();
  }
  assert(actual == _num_cards,
         "Num entries in completed buffers should be " SIZE_FORMAT " but are " SIZE_FORMAT,
         _num_cards, actual);
}
#endif

void G1DirtyCardQueueSet::abandon_completed_buffers() {
  BufferNode* buffers_to_delete = NULL;
  {
    MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag);
    buffers_to_delete = _completed_buffers_head;
    _completed_buffers_head = NULL;
    _completed_buffers_tail = NULL;
    _num_cards = 0;
    set_process_completed_buffers(false);
  }
  while (buffers_to_delete != NULL) {
    BufferNode* bn = buffers_to_delete;
    buffers_to_delete = bn->next();
    bn->set_next(NULL);
    deallocate_buffer(bn);
  }
}

void G1DirtyCardQueueSet::notify_if_necessary() {
  MonitorLocker ml(_cbl_mon, Mutex::_no_safepoint_check_flag);
  if (num_cards() > process_cards_threshold()) {
    set_process_completed_buffers(true);
    ml.notify_all();
  }
}

// Merge lists of buffers. All completed buffers of the source queue set
// are moved to this queue set, emptying the source. The processing
// threads are not notified here.
void G1DirtyCardQueueSet::merge_bufferlists(G1RedirtyCardsQueueSet* src) {
  assert(allocator() == src->allocator(), "precondition");
  const G1BufferNodeList from = src->take_all_completed_buffers();
  if (from._head == NULL) return;

  MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag);
  if (_completed_buffers_tail == NULL) {
    assert(_completed_buffers_head == NULL, "Well-formedness");
    _completed_buffers_head = from._head;
    _completed_buffers_tail = from._tail;
  } else {
    assert(_completed_buffers_head != NULL, "Well-formedness");
    _completed_buffers_tail->set_next(from._head);
    _completed_buffers_tail = from._tail;
  }
  _num_cards += from._entry_count;

  assert((_completed_buffers_head == NULL && _completed_buffers_tail == NULL) ||
         (_completed_buffers_head != NULL && _completed_buffers_tail != NULL),
         "Sanity");
  verify_num_cards();
}

G1BufferNodeList G1DirtyCardQueueSet::take_all_completed_buffers() {
  MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag);
  G1BufferNodeList result(_completed_buffers_head, _completed_buffers_tail, _num_cards);
  _completed_buffers_head = NULL;
  _completed_buffers_tail = NULL;
  _num_cards = 0;
  return result;
}

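// Helper for refining the cards in a single buffer node. Cleans the
// buffered cards first, then sorts the remaining ones, and finally refines
// them, yielding to a pending safepoint if requested.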
class G1RefineBufferedCards : public StackObj {
  BufferNode* const _node;
  CardTable::CardValue** const _node_buffer;
  const size_t _node_buffer_size;
  const uint _worker_id;
  size_t* _total_refined_cards;
  G1RemSet* const _g1rs;
  // TODO: Remove _dcqs when G1TestEpochSyncInConcRefinement executes refine_cleaned_cards()
  // after the try_synchronize() loop.
  G1DirtyCardQueueSet* const _dcqs;

  static inline int compare_card(const CardTable::CardValue* p1,
                                 const CardTable::CardValue* p2) {
    return p2 - p1;
  }

  // Sorts the cards from start_index to _node_buffer_size in *decreasing*
  // address order. Tests showed that this order is preferable to not sorting
  // or increasing address order.
  void sort_cards(size_t start_index) {
    QuickSort::sort(&_node_buffer[start_index],
                    _node_buffer_size - start_index,
                    compare_card,
                    false);
  }

  // Returns the index of the first clean card in the buffer.
  size_t clean_cards() {
    const size_t start = _node->index();
    assert(start <= _node_buffer_size, "invariant");

    // Two-fingered compaction algorithm similar to the filtering mechanism in
    // SATBMarkQueue. The main difference is that clean_card_before_refine()
    // could change the buffer element in-place.
    // We don't check for SuspendibleThreadSet::should_yield(), because
    // cleaning and redirtying the cards is fast.
    CardTable::CardValue** src = &_node_buffer[start];
    CardTable::CardValue** dst = &_node_buffer[_node_buffer_size];
    assert(src <= dst, "invariant");
    for ( ; src < dst; ++src) {
      // Search low to high for a card to keep.
      if (_g1rs->clean_card_before_refine(src)) {
        // Found keeper.  Search high to low for a card to discard.
        while (src < --dst) {
          if (!_g1rs->clean_card_before_refine(dst)) {
            *dst = *src;         // Replace discard with keeper.
            break;
          }
        }
        // If discard search failed (src == dst), the outer loop will also end.
      }
    }

    // dst points to the first retained clean card, or the end of the buffer
    // if all the cards were discarded.
    const size_t first_clean = dst - _node_buffer;
    assert(first_clean >= start && first_clean <= _node_buffer_size, "invariant");
    // Discarded cards are considered refined.
    *_total_refined_cards += first_clean - start;
    return first_clean;
  }

  bool refine_cleaned_cards(size_t start_index) {
    bool result = true;
    size_t i = start_index;
    for ( ; i < _node_buffer_size; ++i) {
      if (SuspendibleThreadSet::should_yield()) {
        redirty_unrefined_cards(i);
        result = false;
        break;
      }
      _g1rs->refine_card_concurrently(_node_buffer[i], _worker_id);
    }
    _node->set_index(i);
    *_total_refined_cards += i - start_index;
    return result;
  }

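  // Redirty the cards from start to the end of the buffer, so that a
  // later refinement pass will reconsider them.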
  void redirty_unrefined_cards(size_t start) {
    for ( ; start < _node_buffer_size; ++start) {
      *_node_buffer[start] = G1CardTable::dirty_card_val();
    }
  }

public:
  G1RefineBufferedCards(BufferNode* node,
                        size_t node_buffer_size,
                        uint worker_id,
                        size_t* total_refined_cards,
                        G1DirtyCardQueueSet* dcqs) :
    _node(node),
    _node_buffer(reinterpret_cast<CardTable::CardValue**>(BufferNode::make_buffer_from_node(node))),
    _node_buffer_size(node_buffer_size),
    _worker_id(worker_id),
    _total_refined_cards(total_refined_cards),
    _g1rs(G1CollectedHeap::heap()->rem_set()),
    _dcqs(dcqs) {}

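  // Cleans, sorts and refines the buffered cards. Returns true if the
  // buffer was fully processed, false if refinement stopped early because
  // of a pending yield request.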
  bool refine() {
    size_t first_clean_index = clean_cards();
    if (first_clean_index == _node_buffer_size) {
      _node->set_index(first_clean_index);
      return true;
    }
    // This fence serves two purposes. First, the cards must be cleaned
    // before processing the contents. Second, we can't proceed with
    // processing a region until after the read of the region's top in
    // collect_and_clean_cards(), for synchronization with possibly concurrent
    // humongous object allocation (see comment at the StoreStore fence before
    // setting the regions' tops in humongous allocation path).
    // It's okay that the reads of the region's top and the region's type race
    // with each other. We need both to be set, in either order, to proceed.
    OrderAccess::fence();

    if (G1TestEpochSyncInConcRefinement && !Thread::current()->is_Java_thread()) {
      // TODO: Asynchronously execute epoch synchronization for multiple
      // node buffers. This could be done by calling start_synchronizing() for
      // multiple node buffers, and associating each required frontier with the
      // buffer. Then execute sort_cards() for these buffers. Finally call
      // try_synchronize() for each of these node buffers with the recorded
      // required frontier.
      G1EpochSynchronizer syncer;
      ResourceMark rm; // For retrieving thread names in log messages.
      syncer.start_synchronizing();
      jlong start_counter = os::elapsed_counter();

      // Spend some time doing useful work instead of blindly waiting.
      sort_cards(first_clean_index);

      // TODO: refine_cleaned_cards() should be called AFTER the try_synchronize()
      // loop below. However, we should redirty unrefined cards and skip refinement
      // work if try_synchronize() spans across a safepoint.
      // See the TODO comment in try_synchronize().
      bool result = refine_cleaned_cards(first_clean_index);
      if (!result) {
        // We need to enqueue partially processed cards before the
        // try_synchronize() loop, which could span a safepoint.
        _dcqs->enqueue_completed_buffer(_node);
      }

      // os::elapsed_counter() ticks are not necessarily nanoseconds, so convert
      // the elapsed time to milliseconds before comparing with the threshold.
      const double increase_urgency_thres_ms = 3.0;
      const char* thread_name = Thread::current()->name();
      bool high_urgency = false;
      // The first call to try_synchronize() does not need high urgency.
      while (!syncer.try_synchronize()) {
        jlong elapsed_counter = os::elapsed_counter() - start_counter;
        if (!high_urgency &&
            TimeHelper::counter_to_millis(elapsed_counter) > increase_urgency_thres_ms) {
          high_urgency = true;
          syncer.increase_urgency();
        }
      }
      log_debug(gc, refine, handshake)("%s: Epoch synced after %.3f ms",
          thread_name, TimeHelper::counter_to_millis(os::elapsed_counter() - start_counter));
      return result;
    } else {
      sort_cards(first_clean_index);
      return refine_cleaned_cards(first_clean_index);
    }
  }
};
 393 
 394 bool G1DirtyCardQueueSet::refine_buffer(BufferNode* node,
 395                                         uint worker_id,
 396                                         size_t* total_refined_cards) {
 397   G1RefineBufferedCards buffered_cards(node,
 398                                        buffer_size(),
 399                                        worker_id,
 400                                        total_refined_cards,
 401                                        this);
 402   return buffered_cards.refine();
 403 }
 404 
 405 #ifndef ASSERT
 406 #define assert_fully_consumed(node, buffer_size)
 407 #else
 408 #define assert_fully_consumed(node, buffer_size)                \
 409   do {                                                          \
 410     size_t _afc_index = (node)->index();                        \
 411     size_t _afc_size = (buffer_size);                           \
 412     assert(_afc_index == _afc_size,                             \
 413            "Buffer was not fully consumed as claimed: index: "  \
 414            SIZE_FORMAT ", size: " SIZE_FORMAT,                  \
 415             _afc_index, _afc_size);                             \
 416   } while (0)
 417 #endif // ASSERT
 418 
 419 bool G1DirtyCardQueueSet::process_or_enqueue_completed_buffer(BufferNode* node) {
 420   if (Thread::current()->is_Java_thread()) {
 421     // If the number of buffers exceeds the limit, make this Java
 422     // thread do the processing itself.  We don't lock to access
 423     // buffer count or padding; it is fine to be imprecise here.  The
 424     // add of padding could overflow, which is treated as unlimited.
 425     size_t limit = max_cards() + max_cards_padding();
 426     if ((num_cards() > limit) && (limit >= max_cards())) {
 427       if (mut_process_buffer(node)) {
 428         return true;
 429       }
 430     }
 431   }
 432   enqueue_completed_buffer(node);
 433   return false;
 434 }
 435 
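// Refine the given buffer in the calling mutator thread, temporarily
// claiming a parallel id to attribute the refined cards to.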
bool G1DirtyCardQueueSet::mut_process_buffer(BufferNode* node) {
  uint worker_id = _free_ids.claim_par_id(); // temporarily claim an id
  uint counter_index = worker_id - par_ids_start();
  size_t* counter = &_mutator_refined_cards_counters[counter_index];
  bool result = refine_buffer(node, worker_id, counter);
  _free_ids.release_par_id(worker_id); // release the id

  if (result) {
    assert_fully_consumed(node, buffer_size());
  }
  return result;
}

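// Claim a completed buffer, unless the number of pending cards is at or
// below stop_at, and refine it. Returns false if no buffer was claimed.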
bool G1DirtyCardQueueSet::refine_completed_buffer_concurrently(uint worker_id,
                                                               size_t stop_at,
                                                               size_t* total_refined_cards) {
  BufferNode* node = get_completed_buffer(stop_at);
  if (node == NULL) {
    return false;
  } else if (refine_buffer(node, worker_id, total_refined_cards)) {
    assert_fully_consumed(node, buffer_size());
    // Done with fully processed buffer.
    deallocate_buffer(node);
    return true;
  } else {
    if (!G1TestEpochSyncInConcRefinement) {
      // Return partially processed buffer to the queue.
      enqueue_completed_buffer(node);
    }
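    // With G1TestEpochSyncInConcRefinement, refine() has already enqueued
    // the partially processed buffer before entering its try_synchronize()
    // loop, so it must not be enqueued again here.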
    return true;
  }
}

void G1DirtyCardQueueSet::abandon_logs() {
  assert(SafepointSynchronize::is_at_safepoint(), "Must be at safepoint.");
  abandon_completed_buffers();

  // Since abandon is done only at safepoints, we can safely manipulate
  // these queues.
  struct AbandonThreadLogClosure : public ThreadClosure {
    virtual void do_thread(Thread* t) {
      G1ThreadLocalData::dirty_card_queue(t).reset();
    }
  } closure;
  Threads::threads_do(&closure);

  G1BarrierSet::shared_dirty_card_queue().reset();
}

void G1DirtyCardQueueSet::concatenate_logs() {
  // Iterate over all the threads; if we find a partial log, add it to
  // the global list of logs.  Temporarily turn off the limit on the number
  // of outstanding buffers.
  assert(SafepointSynchronize::is_at_safepoint(), "Must be at safepoint.");
  size_t old_limit = max_cards();
  set_max_cards(MaxCardsUnlimited);

  struct ConcatenateThreadLogClosure : public ThreadClosure {
    virtual void do_thread(Thread* t) {
      G1DirtyCardQueue& dcq = G1ThreadLocalData::dirty_card_queue(t);
      if (!dcq.is_empty()) {
        dcq.flush();
      }
    }
  } closure;
  Threads::threads_do(&closure);

  G1BarrierSet::shared_dirty_card_queue().flush();
  set_max_cards(old_limit);
}