/*
 * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "gc_implementation/g1/concurrentG1Refine.hpp"
#include "gc_implementation/g1/concurrentG1RefineThread.hpp"
#include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
#include "gc_implementation/g1/g1CollectorPolicy.hpp"
#include "gc_implementation/g1/g1GCPhaseTimes.hpp"
#include "gc_implementation/g1/g1RemSet.hpp"
#include "gc_implementation/g1/heapRegionSeq.inline.hpp"
#include "memory/space.inline.hpp"
#include "runtime/atomic.hpp"
#include "runtime/java.hpp"
#include "utilities/copy.hpp"

// Possible sizes for the card counts cache: odd primes that roughly double in size.
// (See jvmtiTagMap.cpp).

#define MAX_SIZE ((size_t) -1)

size_t ConcurrentG1Refine::_cc_cache_sizes[] = {
      16381,    32771,    76831,    150001,   307261,
     614563,  1228891,  2457733,  4915219,   9830479,
   19660831, 39321619, 78643219, 157286461, MAX_SIZE
};

ConcurrentG1Refine::ConcurrentG1Refine() :
  _card_counts(NULL), _card_epochs(NULL),
  _n_card_counts(0), _max_cards(0), _max_n_card_counts(0),
  _cache_size_index(0), _expand_card_counts(false),
  _hot_cache(NULL),
  _def_use_cache(false), _use_cache(false),
  // We initialize the epochs of the array to 0. By initializing
  // _n_periods to 1 and not 0 we automatically invalidate all the
  // entries on the array. Otherwise we might accidentally think that
  // we claimed a card that was in fact never set (see CR7033292).
  _n_periods(1),
  _threads(NULL), _n_threads(0)
{

  // Ergonomically select initial concurrent refinement parameters
  if (FLAG_IS_DEFAULT(G1ConcRefinementGreenZone)) {
    FLAG_SET_DEFAULT(G1ConcRefinementGreenZone, MAX2<int>(ParallelGCThreads, 1));
  }
  set_green_zone(G1ConcRefinementGreenZone);

  if (FLAG_IS_DEFAULT(G1ConcRefinementYellowZone)) {
    FLAG_SET_DEFAULT(G1ConcRefinementYellowZone, green_zone() * 3);
  }
  set_yellow_zone(MAX2<int>(G1ConcRefinementYellowZone, green_zone()));

  if (FLAG_IS_DEFAULT(G1ConcRefinementRedZone)) {
    FLAG_SET_DEFAULT(G1ConcRefinementRedZone, yellow_zone() * 2);
  }
  set_red_zone(MAX2<int>(G1ConcRefinementRedZone, yellow_zone()));
  _n_worker_threads = thread_num();
  // We need one extra thread to do the young gen rset size sampling.
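  // As a worked example: with all three zone flags left at their defaults
  // and ParallelGCThreads == 4, the zones above come out to green == 4,
  // yellow == 12 and red == 24 completed buffers, and thread_num() yields
  // four refinement workers (plus the sampling thread, five in total).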
  _n_threads = _n_worker_threads + 1;
  reset_threshold_step();

  _threads = NEW_C_HEAP_ARRAY(ConcurrentG1RefineThread*, _n_threads, mtGC);
  int worker_id_offset = (int)DirtyCardQueueSet::num_par_ids();
  ConcurrentG1RefineThread *next = NULL;
  for (int i = _n_threads - 1; i >= 0; i--) {
    ConcurrentG1RefineThread* t = new ConcurrentG1RefineThread(this, next, worker_id_offset, i);
    assert(t != NULL, "Conc refine should have been created");
    assert(t->cg1r() == this, "Conc refine thread should refer to this");
    _threads[i] = t;
    next = t;
  }
}

void ConcurrentG1Refine::reset_threshold_step() {
  if (FLAG_IS_DEFAULT(G1ConcRefinementThresholdStep)) {
    _thread_threshold_step = (yellow_zone() - green_zone()) / (worker_thread_num() + 1);
  } else {
    _thread_threshold_step = G1ConcRefinementThresholdStep;
  }
}

int ConcurrentG1Refine::thread_num() {
  return MAX2<int>((G1ConcRefinementThreads > 0) ? G1ConcRefinementThreads : ParallelGCThreads, 1);
}

void ConcurrentG1Refine::init() {
  if (G1ConcRSLogCacheSize > 0) {
    _g1h = G1CollectedHeap::heap();

    _max_cards = _g1h->max_capacity() >> CardTableModRefBS::card_shift;
    _max_n_card_counts = _max_cards * G1MaxHotCardCountSizePercent / 100;

    size_t max_card_num = ((size_t)1 << (sizeof(unsigned)*BitsPerByte-1)) - 1;
    guarantee(_max_cards < max_card_num, "card_num representation");

    // We need _n_card_counts to be less than _max_n_card_counts here
    // so that the expansion call (below) actually allocates the
    // _counts and _epochs arrays.
    assert(_n_card_counts == 0, "pre-condition");
    assert(_max_n_card_counts > 0, "pre-condition");

    // Find the index into the cache size array of a size that is
    // large enough to hold desired_sz.
    size_t desired_sz = _max_cards / InitialCacheFraction;
    int desired_sz_index = 0;
    while (_cc_cache_sizes[desired_sz_index] < desired_sz) {
      desired_sz_index += 1;
      assert(desired_sz_index < MAX_CC_CACHE_INDEX, "invariant");
    }
    assert(desired_sz_index < MAX_CC_CACHE_INDEX, "invariant");

    // If the desired_sz value is between two sizes then
    // _cc_cache_sizes[desired_sz_index-1] < desired_sz <= _cc_cache_sizes[desired_sz_index]
    // we will start with the lower size in the optimistic expectation that
    // we will not need to expand up. Note desired_sz_index could also be 0.
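    // For example, assuming the usual 512 byte cards, a 1 GB heap has 2M
    // cards; if InitialCacheFraction were 8 (an illustrative value only),
    // desired_sz would be 262144, which falls between 150001 and 307261 in
    // the table above, so we would start with the 150001 entry cache.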
    if (desired_sz_index > 0 &&
        _cc_cache_sizes[desired_sz_index] > desired_sz) {
      desired_sz_index -= 1;
    }

    if (!expand_card_count_cache(desired_sz_index)) {
      // Allocation was unsuccessful - exit
      vm_exit_during_initialization("Could not reserve enough space for card count cache");
    }
    assert(_n_card_counts > 0, "post-condition");
    assert(_cache_size_index == desired_sz_index, "post-condition");

    Copy::fill_to_bytes(&_card_counts[0],
                        _n_card_counts * sizeof(CardCountCacheEntry));
    Copy::fill_to_bytes(&_card_epochs[0], _n_card_counts * sizeof(CardEpochCacheEntry));

    ModRefBarrierSet* bs = _g1h->mr_bs();
    guarantee(bs->is_a(BarrierSet::CardTableModRef), "Precondition");
    _ct_bs = (CardTableModRefBS*)bs;
    _ct_bot = _ct_bs->byte_for_const(_g1h->reserved_region().start());

    _def_use_cache = true;
    _use_cache = true;
    _hot_cache_size = (1 << G1ConcRSLogCacheSize);
    _hot_cache = NEW_C_HEAP_ARRAY(jbyte*, _hot_cache_size, mtGC);
    _n_hot = 0;
    _hot_cache_idx = 0;

    // For refining the cards in the hot cache in parallel
    int n_workers = (ParallelGCThreads > 0 ?
                        _g1h->workers()->total_workers() : 1);
    _hot_cache_par_chunk_size = MAX2(1, _hot_cache_size / n_workers);
    _hot_cache_par_claimed_idx = 0;
  }
}

void ConcurrentG1Refine::stop() {
  if (_threads != NULL) {
    for (int i = 0; i < _n_threads; i++) {
      _threads[i]->stop();
    }
  }
}

void ConcurrentG1Refine::reinitialize_threads() {
  reset_threshold_step();
  if (_threads != NULL) {
    for (int i = 0; i < _n_threads; i++) {
      _threads[i]->initialize();
    }
  }
}

ConcurrentG1Refine::~ConcurrentG1Refine() {
  if (G1ConcRSLogCacheSize > 0) {
    // Please see the comment in allocate_card_count_cache
    // for why we call os::malloc() and os::free() directly.
    assert(_card_counts != NULL, "Logic");
    os::free(_card_counts, mtGC);
    assert(_card_epochs != NULL, "Logic");
    os::free(_card_epochs, mtGC);

    assert(_hot_cache != NULL, "Logic");
    FREE_C_HEAP_ARRAY(jbyte*, _hot_cache, mtGC);
  }
  if (_threads != NULL) {
    for (int i = 0; i < _n_threads; i++) {
      delete _threads[i];
    }
    FREE_C_HEAP_ARRAY(ConcurrentG1RefineThread*, _threads, mtGC);
  }
}

void ConcurrentG1Refine::threads_do(ThreadClosure *tc) {
  if (_threads != NULL) {
    for (int i = 0; i < _n_threads; i++) {
      tc->do_thread(_threads[i]);
    }
  }
}

bool ConcurrentG1Refine::is_young_card(jbyte* card_ptr) {
  HeapWord* start = _ct_bs->addr_for(card_ptr);
  HeapRegion* r = _g1h->heap_region_containing(start);
  if (r != NULL && r->is_young()) {
    return true;
  }
  // Either the card is not associated with a heap region, or the
  // region it is associated with is not young.
  return false;
}

jbyte* ConcurrentG1Refine::add_card_count(jbyte* card_ptr, int* count, bool* defer) {
  unsigned new_card_num = ptr_2_card_num(card_ptr);
  unsigned bucket = hash(new_card_num);
  assert(0 <= bucket && bucket < _n_card_counts, "Bounds");

  CardCountCacheEntry* count_ptr = &_card_counts[bucket];
  CardEpochCacheEntry* epoch_ptr = &_card_epochs[bucket];

  // We have to construct a new entry if we haven't updated the counts
  // during the current period, or if the count was updated for a
  // different card number.
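  // Note: make_epoch_entry() packs the card number and the epoch into a
  // single julong (the card number in the upper half, the epoch in the
  // lower half; see CardEpochCacheEntry in concurrentG1Refine.hpp), which
  // is what lets both fields be claimed with a single 64 bit CAS below.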
  unsigned int new_epoch = (unsigned int) _n_periods;
  julong new_epoch_entry = make_epoch_entry(new_card_num, new_epoch);

  while (true) {
    // Fetch the previous epoch value
    julong prev_epoch_entry = epoch_ptr->_value;
    julong cas_res;

    if (extract_epoch(prev_epoch_entry) != new_epoch) {
      // This entry has not yet been updated during this period.
      // Note: we update the epoch value atomically to ensure
      // that there is only one winner that updates the cached
      // card_ptr value even though all the refine threads share
      // the same epoch value.

      cas_res = (julong) Atomic::cmpxchg((jlong) new_epoch_entry,
                                         (volatile jlong*)&epoch_ptr->_value,
                                         (jlong) prev_epoch_entry);

      if (cas_res == prev_epoch_entry) {
        // We have successfully won the race to update the
        // epoch and card_num value. Make it look like the
        // count and eviction count were previously cleared.
        count_ptr->_count = 1;
        count_ptr->_evict_count = 0;
        *count = 0;
        // We can defer the processing of card_ptr
        *defer = true;
        return card_ptr;
      }
      // We did not win the race to update the epoch field, so some other
      // thread must have done it. The value that gets returned by CAS
      // should be the new epoch value.
      assert(extract_epoch(cas_res) == new_epoch, "unexpected epoch");
      // We could 'continue' here or just re-read the previous epoch value
      prev_epoch_entry = epoch_ptr->_value;
    }

    // The epoch entry for card_ptr has been updated during this period.
    unsigned old_card_num = extract_card_num(prev_epoch_entry);

    // The card count that will be returned to caller
    *count = count_ptr->_count;

    // Are we updating the count for the same card?
    if (new_card_num == old_card_num) {
      // Same card - just update the count. We could have more than one
      // thread racing to update count for the current card. It should be
      // OK not to use a CAS as the only penalty should be some missed
      // increments of the count which delays identifying the card as "hot".

      if (*count < max_jubyte) count_ptr->_count++;
      // We can defer the processing of card_ptr
      *defer = true;
      return card_ptr;
    }

    // Different card - evict old card info
    if (count_ptr->_evict_count < max_jubyte) count_ptr->_evict_count++;
    if (count_ptr->_evict_count > G1CardCountCacheExpandThreshold) {
      // Trigger a resize of the card count cache the next time we clear it
      _expand_card_counts = true;
    }

    cas_res = (julong) Atomic::cmpxchg((jlong) new_epoch_entry,
                                       (volatile jlong*)&epoch_ptr->_value,
                                       (jlong) prev_epoch_entry);

    if (cas_res == prev_epoch_entry) {
      // We successfully updated the card num value in the epoch entry
      count_ptr->_count = 0; // initialize counter for new card num
      jbyte* old_card_ptr = card_num_2_ptr(old_card_num);

      // Even though the region containing the card at old_card_num was not
      // in the young list when old_card_num was recorded in the epoch
      // cache it could have been added to the free list and subsequently
      // added to the young list in the intervening time. See CR 6817995.
      // We do not deal with this case here - it will be handled in
      // HeapRegion::oops_on_card_seq_iterate_careful after it has been
      // determined that the region containing the card has been allocated
      // to, and it's safe to check the young type of the region.
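      // Note that, as when claiming a fresh entry above, only the thread
      // that wins the CAS reinitializes _count; a losing thread simply
      // retries the enclosing loop with the updated epoch entry.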
      // We do not want to defer processing of card_ptr in this case
      // (we need to refine old_card_ptr and card_ptr)
      *defer = false;
      return old_card_ptr;
    }
    // Someone else beat us - try again.
  }
}

jbyte* ConcurrentG1Refine::cache_insert(jbyte* card_ptr, bool* defer) {
  int count;
  jbyte* cached_ptr = add_card_count(card_ptr, &count, defer);
  assert(cached_ptr != NULL, "bad cached card ptr");

  // We've just inserted a card pointer into the card count cache
  // and got back either the card that we just inserted or the
  // (evicted) previous contents of that count slot.

  // The card we got back could be in a young region. When the
  // returned card (if evicted) was originally inserted, we had
  // determined that its containing region was not young. However
  // it is possible for the region to be freed during a cleanup
  // pause, then reallocated and tagged as young which will result
  // in the returned card residing in a young region.
  //
  // We do not deal with this case here - the change from non-young
  // to young could be observed at any time - it will be handled in
  // HeapRegion::oops_on_card_seq_iterate_careful after it has been
  // determined that the region containing the card has been allocated
  // to.

  // If the card pointer we obtained from the card count cache is not
  // hot, do not store it in the cache; return it for immediate
  // refining.
  if (count < G1ConcRSHotCardLimit) {
    return cached_ptr;
  }

  // Otherwise, the pointer we got from the _card_counts cache is hot.
  jbyte* res = NULL;
  MutexLockerEx x(HotCardCache_lock, Mutex::_no_safepoint_check_flag);
  if (_n_hot == _hot_cache_size) {
    res = _hot_cache[_hot_cache_idx];
    _n_hot--;
  }
  // Now _n_hot < _hot_cache_size, and we can insert at _hot_cache_idx.
  _hot_cache[_hot_cache_idx] = cached_ptr;
  _hot_cache_idx++;
  if (_hot_cache_idx == _hot_cache_size) _hot_cache_idx = 0;
  _n_hot++;

  // The card obtained from the hot card cache could be in a young
  // region. See above on how this can happen.

  return res;
}

void ConcurrentG1Refine::clean_up_cache(int worker_i,
                                        G1RemSet* g1rs,
                                        DirtyCardQueue* into_cset_dcq) {
  assert(!use_cache(), "cache should be disabled");
  int start_idx;

  while ((start_idx = _hot_cache_par_claimed_idx) < _n_hot) { // read once
    int end_idx = start_idx + _hot_cache_par_chunk_size;

    if (start_idx ==
        Atomic::cmpxchg(end_idx, &_hot_cache_par_claimed_idx, start_idx)) {
      // The current worker has successfully claimed the chunk [start_idx..end_idx)
      end_idx = MIN2(end_idx, _n_hot);
      for (int i = start_idx; i < end_idx; i++) {
        jbyte* entry = _hot_cache[i];
        if (entry != NULL) {
          if (g1rs->concurrentRefineOneCard(entry, worker_i, true)) {
            // 'entry' contains references that point into the current
            // collection set. We need to record 'entry' in the DCQS
            // that's used for that purpose.
            //
            // The only time we care about recording cards that contain
            // references that point into the collection set is during
            // RSet updating while within an evacuation pause.
            // In this case worker_i should be the id of a GC worker thread
            assert(SafepointSynchronize::is_at_safepoint(), "not during an evacuation pause");
            assert(worker_i < (int) (ParallelGCThreads == 0 ? 1 : ParallelGCThreads),
                   "incorrect worker id");
            into_cset_dcq->enqueue(entry);
          }
        }
      }
    }
  }
}

// The arrays used to hold the card counts and the epochs must have
// a 1:1 correspondence. Hence they are allocated and freed together.
// Returns true if the allocations of both the counts and epochs
// were successful; false otherwise.
bool ConcurrentG1Refine::allocate_card_count_cache(size_t n,
                                                   CardCountCacheEntry** counts,
                                                   CardEpochCacheEntry** epochs) {
  // We call the allocation/free routines directly for the counts
  // and epochs arrays. The NEW_C_HEAP_ARRAY/FREE_C_HEAP_ARRAY
  // macros call AllocateHeap and FreeHeap respectively.
  // AllocateHeap will call vm_exit_out_of_memory in the event
  // of an allocation failure and abort the JVM. With the
  // _counts/epochs arrays we only need to abort the JVM if the
  // initial allocation of these arrays fails.
  //
  // Additionally AllocateHeap/FreeHeap do some tracing of
  // allocate/free calls so calling one without calling the
  // other can cause inconsistencies in the tracing. So we
  // call neither.

  assert(*counts == NULL, "out param");
  assert(*epochs == NULL, "out param");

  size_t counts_size = n * sizeof(CardCountCacheEntry);
  size_t epochs_size = n * sizeof(CardEpochCacheEntry);

  *counts = (CardCountCacheEntry*) os::malloc(counts_size, mtGC);
  if (*counts == NULL) {
    // allocation was unsuccessful
    return false;
  }

  *epochs = (CardEpochCacheEntry*) os::malloc(epochs_size, mtGC);
  if (*epochs == NULL) {
    // allocation was unsuccessful - free counts array
    assert(*counts != NULL, "must be");
    os::free(*counts, mtGC);
    *counts = NULL;
    return false;
  }

  // We successfully allocated both counts and epochs
  return true;
}

// Returns true if the card counts/epochs cache was
// successfully expanded; false otherwise.
bool ConcurrentG1Refine::expand_card_count_cache(int cache_size_idx) {
  // Can we expand the card count and epoch tables?
  if (_n_card_counts < _max_n_card_counts) {
    assert(cache_size_idx >= 0 && cache_size_idx < MAX_CC_CACHE_INDEX, "oob");

    size_t cache_size = _cc_cache_sizes[cache_size_idx];
    // Make sure we don't go bigger than we will ever need
    cache_size = MIN2(cache_size, _max_n_card_counts);

    // Should we expand the card count and card epoch tables?
    if (cache_size > _n_card_counts) {
      // We have been asked to allocate new, larger, arrays for
      // the card counts and the epochs. Attempt the allocation
      // of both before we free the existing arrays in case
      // the allocation is unsuccessful...
      CardCountCacheEntry* counts = NULL;
      CardEpochCacheEntry* epochs = NULL;

      if (allocate_card_count_cache(cache_size, &counts, &epochs)) {
        // Allocation was successful.
        // We can just free the old arrays; we're
        // not interested in preserving the contents
        if (_card_counts != NULL) os::free(_card_counts, mtGC);
        if (_card_epochs != NULL) os::free(_card_epochs, mtGC);

        // Cache the size of the arrays and the index that got us there.
        _n_card_counts = cache_size;
        _cache_size_index = cache_size_idx;

        _card_counts = counts;
        _card_epochs = epochs;

        // We successfully allocated/expanded the caches.
        return true;
      }
    }
  }

  // We did not successfully expand the caches.
  return false;
}

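// Expansion of the card count cache is only requested (by setting
// _expand_card_counts in add_card_count() above) once a bucket's eviction
// count crosses G1CardCountCacheExpandThreshold; the expansion itself is
// then performed below, outside the refinement fast path, when the counts
// are cleared at the start of the next period.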
void ConcurrentG1Refine::clear_and_record_card_counts() {
  if (G1ConcRSLogCacheSize == 0) {
    return;
  }

  double start = os::elapsedTime();

  if (_expand_card_counts) {
    int new_idx = _cache_size_index + 1;

    if (expand_card_count_cache(new_idx)) {
      // Allocation was successful and _n_card_counts has
      // been updated to the new size. We only need to clear
      // the epochs so we don't read a bogus epoch value
      // when inserting a card into the hot card cache.
      Copy::fill_to_bytes(&_card_epochs[0], _n_card_counts * sizeof(CardEpochCacheEntry));
    }
    _expand_card_counts = false;
  }

  int this_epoch = (int) _n_periods;
  assert((this_epoch+1) <= max_jint, "too many periods");
  // Update the epoch, effectively invalidating every entry in the
  // card count cache.
  _n_periods++;
  double cc_clear_time_ms = (os::elapsedTime() - start) * 1000;
  _g1h->g1_policy()->phase_times()->record_cc_clear_time_ms(cc_clear_time_ms);
}

void ConcurrentG1Refine::print_worker_threads_on(outputStream* st) const {
  for (int i = 0; i < _n_threads; ++i) {
    _threads[i]->print_on(st);
    st->cr();
  }
}