1 /* 2 * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 
 *
 */

#include "precompiled.hpp"
#include "gc/g1/g1BufferNodeList.hpp"
#include "gc/g1/g1CardTableEntryClosure.hpp"
#include "gc/g1/g1CollectedHeap.inline.hpp"
#include "gc/g1/g1DirtyCardQueue.hpp"
#include "gc/g1/g1EpochSynchronizer.hpp"
#include "gc/g1/g1FreeIdSet.hpp"
#include "gc/g1/g1RedirtyCardsQueue.hpp"
#include "gc/g1/g1RemSet.hpp"
#include "gc/g1/g1ThreadLocalData.hpp"
#include "gc/g1/heapRegionRemSet.hpp"
#include "gc/shared/suspendibleThreadSet.hpp"
#include "gc/shared/workgroup.hpp"
#include "memory/iterator.hpp"
#include "runtime/flags/flagSetting.hpp"
#include "runtime/mutexLocker.hpp"
#include "runtime/orderAccess.hpp"
#include "runtime/os.hpp"
#include "runtime/safepoint.hpp"
#include "runtime/thread.inline.hpp"
#include "runtime/threadSMR.hpp"
#include "utilities/quickSort.hpp"

G1DirtyCardQueue::G1DirtyCardQueue(G1DirtyCardQueueSet* qset) :
  // Dirty card queues are always active, so we create them with their
  // active field set to true.
  PtrQueue(qset, true /* active */)
{ }

// Flush any remaining entries to the qset before the queue goes away.
G1DirtyCardQueue::~G1DirtyCardQueue() {
  flush();
}

// Called when the queue's buffer is full. Hands the buffer to the qset,
// which either processes it immediately (buffer can be reused) or
// enqueues it for later refinement (a fresh buffer is needed).
void G1DirtyCardQueue::handle_completed_buffer() {
  assert(_buf != NULL, "precondition");
  BufferNode* node = BufferNode::make_node_from_buffer(_buf, index());
  G1DirtyCardQueueSet* dcqs = dirty_card_qset();
  if (dcqs->process_or_enqueue_completed_buffer(node)) {
    reset(); // Buffer fully processed, reset index.
  } else {
    allocate_buffer(); // Buffer enqueued, get a new one.
  }
}

// Assumed to be zero by concurrent threads.
70 static uint par_ids_start() { return 0; } 71 72 G1DirtyCardQueueSet::G1DirtyCardQueueSet(Monitor* cbl_mon, 73 BufferNode::Allocator* allocator) : 74 PtrQueueSet(allocator), 75 _cbl_mon(cbl_mon), 76 _completed_buffers_head(NULL), 77 _completed_buffers_tail(NULL), 78 _num_cards(0), 79 _process_cards_threshold(ProcessCardsThresholdNever), 80 _process_completed_buffers(false), 81 _max_cards(MaxCardsUnlimited), 82 _max_cards_padding(0), 83 _free_ids(par_ids_start(), num_par_ids()), 84 _mutator_refined_cards_counters(NEW_C_HEAP_ARRAY(size_t, num_par_ids(), mtGC)) 85 { 86 ::memset(_mutator_refined_cards_counters, 0, num_par_ids() * sizeof(size_t)); 87 _all_active = true; 88 } 89 90 G1DirtyCardQueueSet::~G1DirtyCardQueueSet() { 91 abandon_completed_buffers(); 92 FREE_C_HEAP_ARRAY(size_t, _mutator_refined_cards_counters); 93 } 94 95 // Determines how many mutator threads can process the buffers in parallel. 96 uint G1DirtyCardQueueSet::num_par_ids() { 97 return (uint)os::initial_active_processor_count(); 98 } 99 100 size_t G1DirtyCardQueueSet::total_mutator_refined_cards() const { 101 size_t sum = 0; 102 for (uint i = 0; i < num_par_ids(); ++i) { 103 sum += _mutator_refined_cards_counters[i]; 104 } 105 return sum; 106 } 107 108 void G1DirtyCardQueueSet::handle_zero_index_for_thread(Thread* t) { 109 G1ThreadLocalData::dirty_card_queue(t).handle_zero_index(); 110 } 111 112 void G1DirtyCardQueueSet::enqueue_completed_buffer(BufferNode* cbn) { 113 MonitorLocker ml(_cbl_mon, Mutex::_no_safepoint_check_flag); 114 cbn->set_next(NULL); 115 if (_completed_buffers_tail == NULL) { 116 assert(_completed_buffers_head == NULL, "Well-formedness"); 117 _completed_buffers_head = cbn; 118 _completed_buffers_tail = cbn; 119 } else { 120 _completed_buffers_tail->set_next(cbn); 121 _completed_buffers_tail = cbn; 122 } 123 _num_cards += buffer_size() - cbn->index(); 124 125 if (!process_completed_buffers() && 126 (num_cards() > process_cards_threshold())) { 127 
set_process_completed_buffers(true); 128 ml.notify_all(); 129 } 130 verify_num_cards(); 131 } 132 133 BufferNode* G1DirtyCardQueueSet::get_completed_buffer(size_t stop_at) { 134 MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag); 135 136 if (num_cards() <= stop_at) { 137 return NULL; 138 } 139 140 assert(num_cards() > 0, "invariant"); 141 assert(_completed_buffers_head != NULL, "invariant"); 142 assert(_completed_buffers_tail != NULL, "invariant"); 143 144 BufferNode* bn = _completed_buffers_head; 145 _num_cards -= buffer_size() - bn->index(); 146 _completed_buffers_head = bn->next(); 147 if (_completed_buffers_head == NULL) { 148 assert(num_cards() == 0, "invariant"); 149 _completed_buffers_tail = NULL; 150 set_process_completed_buffers(false); 151 } 152 verify_num_cards(); 153 bn->set_next(NULL); 154 return bn; 155 } 156 157 #ifdef ASSERT 158 void G1DirtyCardQueueSet::verify_num_cards() const { 159 size_t actual = 0; 160 BufferNode* cur = _completed_buffers_head; 161 while (cur != NULL) { 162 actual += buffer_size() - cur->index(); 163 cur = cur->next(); 164 } 165 assert(actual == _num_cards, 166 "Num entries in completed buffers should be " SIZE_FORMAT " but are " SIZE_FORMAT, 167 _num_cards, actual); 168 } 169 #endif 170 171 void G1DirtyCardQueueSet::abandon_completed_buffers() { 172 BufferNode* buffers_to_delete = NULL; 173 { 174 MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag); 175 buffers_to_delete = _completed_buffers_head; 176 _completed_buffers_head = NULL; 177 _completed_buffers_tail = NULL; 178 _num_cards = 0; 179 set_process_completed_buffers(false); 180 } 181 while (buffers_to_delete != NULL) { 182 BufferNode* bn = buffers_to_delete; 183 buffers_to_delete = bn->next(); 184 bn->set_next(NULL); 185 deallocate_buffer(bn); 186 } 187 } 188 189 void G1DirtyCardQueueSet::notify_if_necessary() { 190 MonitorLocker ml(_cbl_mon, Mutex::_no_safepoint_check_flag); 191 if (num_cards() > process_cards_threshold()) { 192 
set_process_completed_buffers(true); 193 ml.notify_all(); 194 } 195 } 196 197 // Merge lists of buffers. Notify the processing threads. 198 // The source queue is emptied as a result. The queues 199 // must share the monitor. 200 void G1DirtyCardQueueSet::merge_bufferlists(G1RedirtyCardsQueueSet* src) { 201 assert(allocator() == src->allocator(), "precondition"); 202 const G1BufferNodeList from = src->take_all_completed_buffers(); 203 if (from._head == NULL) return; 204 205 MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag); 206 if (_completed_buffers_tail == NULL) { 207 assert(_completed_buffers_head == NULL, "Well-formedness"); 208 _completed_buffers_head = from._head; 209 _completed_buffers_tail = from._tail; 210 } else { 211 assert(_completed_buffers_head != NULL, "Well formedness"); 212 _completed_buffers_tail->set_next(from._head); 213 _completed_buffers_tail = from._tail; 214 } 215 _num_cards += from._entry_count; 216 217 assert(_completed_buffers_head == NULL && _completed_buffers_tail == NULL || 218 _completed_buffers_head != NULL && _completed_buffers_tail != NULL, 219 "Sanity"); 220 verify_num_cards(); 221 } 222 223 G1BufferNodeList G1DirtyCardQueueSet::take_all_completed_buffers() { 224 MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag); 225 G1BufferNodeList result(_completed_buffers_head, _completed_buffers_tail, _num_cards); 226 _completed_buffers_head = NULL; 227 _completed_buffers_tail = NULL; 228 _num_cards = 0; 229 return result; 230 } 231 232 class G1RefineBufferedCards : public StackObj { 233 BufferNode* const _node; 234 CardTable::CardValue** const _node_buffer; 235 const size_t _node_buffer_size; 236 const uint _worker_id; 237 size_t* _total_refined_cards; 238 G1RemSet* const _g1rs; 239 // TODO: Remove _dcqs when G1TestEpochSyncInConcRefinement executes refine_cleaned_cards() 240 // after the try_synchronize() loop. 
241 G1DirtyCardQueueSet* const _dcqs; 242 243 static inline int compare_card(const CardTable::CardValue* p1, 244 const CardTable::CardValue* p2) { 245 return p2 - p1; 246 } 247 248 // Sorts the cards from start_index to _node_buffer_size in *decreasing* 249 // address order. Tests showed that this order is preferable to not sorting 250 // or increasing address order. 251 void sort_cards(size_t start_index) { 252 QuickSort::sort(&_node_buffer[start_index], 253 _node_buffer_size - start_index, 254 compare_card, 255 false); 256 } 257 258 // Returns the index to the first clean card in the buffer. 259 size_t clean_cards() { 260 const size_t start = _node->index(); 261 assert(start <= _node_buffer_size, "invariant"); 262 263 // Two-fingered compaction algorithm similar to the filtering mechanism in 264 // SATBMarkQueue. The main difference is that clean_card_before_refine() 265 // could change the buffer element in-place. 266 // We don't check for SuspendibleThreadSet::should_yield(), because 267 // cleaning and redirtying the cards is fast. 268 CardTable::CardValue** src = &_node_buffer[start]; 269 CardTable::CardValue** dst = &_node_buffer[_node_buffer_size]; 270 assert(src <= dst, "invariant"); 271 for ( ; src < dst; ++src) { 272 // Search low to high for a card to keep. 273 if (_g1rs->clean_card_before_refine(src)) { 274 // Found keeper. Search high to low for a card to discard. 275 while (src < --dst) { 276 if (!_g1rs->clean_card_before_refine(dst)) { 277 *dst = *src; // Replace discard with keeper. 278 break; 279 } 280 } 281 // If discard search failed (src == dst), the outer loop will also end. 282 } 283 } 284 285 // dst points to the first retained clean card, or the end of the buffer 286 // if all the cards were discarded. 287 const size_t first_clean = dst - _node_buffer; 288 assert(first_clean >= start && first_clean <= _node_buffer_size, "invariant"); 289 // Discarded cards are considered as refined. 
290 *_total_refined_cards += first_clean - start; 291 return first_clean; 292 } 293 294 bool refine_cleaned_cards(size_t start_index) { 295 bool result = true; 296 size_t i = start_index; 297 for ( ; i < _node_buffer_size; ++i) { 298 if (SuspendibleThreadSet::should_yield()) { 299 redirty_unrefined_cards(i); 300 result = false; 301 break; 302 } 303 _g1rs->refine_card_concurrently(_node_buffer[i], _worker_id); 304 } 305 _node->set_index(i); 306 *_total_refined_cards += i - start_index; 307 return result; 308 } 309 310 void redirty_unrefined_cards(size_t start) { 311 for ( ; start < _node_buffer_size; ++start) { 312 *_node_buffer[start] = G1CardTable::dirty_card_val(); 313 } 314 } 315 316 public: 317 G1RefineBufferedCards(BufferNode* node, 318 size_t node_buffer_size, 319 uint worker_id, 320 size_t* total_refined_cards, 321 G1DirtyCardQueueSet* dcqs) : 322 _node(node), 323 _node_buffer(reinterpret_cast<CardTable::CardValue**>(BufferNode::make_buffer_from_node(node))), 324 _node_buffer_size(node_buffer_size), 325 _worker_id(worker_id), 326 _total_refined_cards(total_refined_cards), 327 _g1rs(G1CollectedHeap::heap()->rem_set()), 328 _dcqs(dcqs) {} 329 330 bool refine() { 331 size_t first_clean_index = clean_cards(); 332 if (first_clean_index == _node_buffer_size) { 333 _node->set_index(first_clean_index); 334 return true; 335 } 336 // This fence serves two purposes. First, the cards must be cleaned 337 // before processing the contents. Second, we can't proceed with 338 // processing a region until after the read of the region's top in 339 // collect_and_clean_cards(), for synchronization with possibly concurrent 340 // humongous object allocation (see comment at the StoreStore fence before 341 // setting the regions' tops in humongous allocation path). 342 // It's okay that reading region's top and reading region's type were racy 343 // wrto each other. We need both set, in any order, to proceed. 
344 OrderAccess::fence(); 345 346 if (G1TestEpochSyncInConcRefinement && !Thread::current()->is_Java_thread()) { 347 // TODO: Asynchronously execute epoch synchronization for multiple 348 // node buffers. This could be done by calling start_synchronizing() for 349 // multiple node buffers, and associating each required frontier with the 350 // buffer. Then execute sort_cards() for these buffers. Finally call 351 // try_synchronize() for each of these node buffers with the recorded 352 // required frontier. 353 G1EpochSynchronizer syncer; 354 ResourceMark rm; // For retrieving thread names in log messages. 355 syncer.start_synchronizing(); 356 jlong start_counter = os::elapsed_counter(); 357 358 // Spend some time doing useful work instead of blindly waiting. 359 sort_cards(first_clean_index); 360 361 // TODO: refine_cleaned_cards() should be called AFTER the try_synchronize() 362 // loop below. However, we should redirty unrefined cards and skip refinement 363 // work if try_synchronize() spans across a safepoint. 364 // See the TODO comment in try_synchronize(). 365 bool result = refine_cleaned_cards(first_clean_index); 366 if (!result) { 367 // We need to enqueue partially processed cards before the try_synchronize() loop, 368 // which could spans across a safepoint. 369 _dcqs->enqueue_completed_buffer(_node); 370 } 371 372 const jlong increase_urgency_thres_ns = 3 * NANOSECS_PER_MILLISEC; // 3 millis 373 const char* thread_name = Thread::current()->name(); 374 bool synced = false; 375 bool high_urgency = false; 376 // The first call to try_synchronize() does not need high urgency. 
377 while (!syncer.try_synchronize()) { 378 jlong elapsed_counter = os::elapsed_counter() - start_counter; 379 if (!high_urgency && elapsed_counter > increase_urgency_thres_ns) { 380 high_urgency = true; 381 syncer.increase_urgency(); 382 } 383 }; 384 log_debug(gc, refine, handshake)("%s: Epoch synced after %.3f ms", 385 thread_name, TimeHelper::counter_to_millis(os::elapsed_counter() - start_counter)); 386 return result; 387 } else { 388 sort_cards(first_clean_index); 389 return refine_cleaned_cards(first_clean_index); 390 } 391 } 392 }; 393 394 bool G1DirtyCardQueueSet::refine_buffer(BufferNode* node, 395 uint worker_id, 396 size_t* total_refined_cards) { 397 G1RefineBufferedCards buffered_cards(node, 398 buffer_size(), 399 worker_id, 400 total_refined_cards, 401 this); 402 return buffered_cards.refine(); 403 } 404 405 #ifndef ASSERT 406 #define assert_fully_consumed(node, buffer_size) 407 #else 408 #define assert_fully_consumed(node, buffer_size) \ 409 do { \ 410 size_t _afc_index = (node)->index(); \ 411 size_t _afc_size = (buffer_size); \ 412 assert(_afc_index == _afc_size, \ 413 "Buffer was not fully consumed as claimed: index: " \ 414 SIZE_FORMAT ", size: " SIZE_FORMAT, \ 415 _afc_index, _afc_size); \ 416 } while (0) 417 #endif // ASSERT 418 419 bool G1DirtyCardQueueSet::process_or_enqueue_completed_buffer(BufferNode* node) { 420 if (Thread::current()->is_Java_thread()) { 421 // If the number of buffers exceeds the limit, make this Java 422 // thread do the processing itself. We don't lock to access 423 // buffer count or padding; it is fine to be imprecise here. The 424 // add of padding could overflow, which is treated as unlimited. 
425 size_t limit = max_cards() + max_cards_padding(); 426 if ((num_cards() > limit) && (limit >= max_cards())) { 427 if (mut_process_buffer(node)) { 428 return true; 429 } 430 } 431 } 432 enqueue_completed_buffer(node); 433 return false; 434 } 435 436 bool G1DirtyCardQueueSet::mut_process_buffer(BufferNode* node) { 437 uint worker_id = _free_ids.claim_par_id(); // temporarily claim an id 438 uint counter_index = worker_id - par_ids_start(); 439 size_t* counter = &_mutator_refined_cards_counters[counter_index]; 440 bool result = refine_buffer(node, worker_id, counter); 441 _free_ids.release_par_id(worker_id); // release the id 442 443 if (result) { 444 assert_fully_consumed(node, buffer_size()); 445 } 446 return result; 447 } 448 449 bool G1DirtyCardQueueSet::refine_completed_buffer_concurrently(uint worker_id, 450 size_t stop_at, 451 size_t* total_refined_cards) { 452 BufferNode* node = get_completed_buffer(stop_at); 453 if (node == NULL) { 454 return false; 455 } else if (refine_buffer(node, worker_id, total_refined_cards)) { 456 assert_fully_consumed(node, buffer_size()); 457 // Done with fully processed buffer. 458 deallocate_buffer(node); 459 return true; 460 } else { 461 if (!G1TestEpochSyncInConcRefinement) { 462 // Return partially processed buffer to the queue. 463 enqueue_completed_buffer(node); 464 } 465 return true; 466 } 467 } 468 469 void G1DirtyCardQueueSet::abandon_logs() { 470 assert(SafepointSynchronize::is_at_safepoint(), "Must be at safepoint."); 471 abandon_completed_buffers(); 472 473 // Since abandon is done only at safepoints, we can safely manipulate 474 // these queues. 
475 struct AbandonThreadLogClosure : public ThreadClosure { 476 virtual void do_thread(Thread* t) { 477 G1ThreadLocalData::dirty_card_queue(t).reset(); 478 } 479 } closure; 480 Threads::threads_do(&closure); 481 482 G1BarrierSet::shared_dirty_card_queue().reset(); 483 } 484 485 void G1DirtyCardQueueSet::concatenate_logs() { 486 // Iterate over all the threads, if we find a partial log add it to 487 // the global list of logs. Temporarily turn off the limit on the number 488 // of outstanding buffers. 489 assert(SafepointSynchronize::is_at_safepoint(), "Must be at safepoint."); 490 size_t old_limit = max_cards(); 491 set_max_cards(MaxCardsUnlimited); 492 493 struct ConcatenateThreadLogClosure : public ThreadClosure { 494 virtual void do_thread(Thread* t) { 495 G1DirtyCardQueue& dcq = G1ThreadLocalData::dirty_card_queue(t); 496 if (!dcq.is_empty()) { 497 dcq.flush(); 498 } 499 } 500 } closure; 501 Threads::threads_do(&closure); 502 503 G1BarrierSet::shared_dirty_card_queue().flush(); 504 set_max_cards(old_limit); 505 }