/*
 * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "gc_implementation/g1/concurrentG1Refine.hpp"
#include "gc_implementation/g1/concurrentG1RefineThread.hpp"
#include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
#include "gc_implementation/g1/g1CollectorPolicy.hpp"
#include "gc_implementation/g1/g1GCPhaseTimes.hpp"
#include "gc_implementation/g1/g1RemSet.hpp"
#include "gc_implementation/g1/heapRegionSeq.inline.hpp"
#include "memory/space.inline.hpp"
#include "runtime/atomic.hpp"
#include "runtime/java.hpp"
#include "utilities/copy.hpp"

// Possible sizes for the card counts cache: odd primes that roughly double in size.
// (See jvmtiTagMap.cpp).

#define MAX_SIZE ((size_t) -1)

size_t ConcurrentG1Refine::_cc_cache_sizes[] = {
          16381,    32771,    76831,    150001,   307261,
         614563,  1228891,  2457733,   4915219,  9830479,
       19660831, 39321619, 78643219, 157286461,  MAX_SIZE
  };

ConcurrentG1Refine::ConcurrentG1Refine() :
  _card_counts(NULL), _card_epochs(NULL),
  _n_card_counts(0), _max_cards(0), _max_n_card_counts(0),
  _cache_size_index(0), _expand_card_counts(false),
  _hot_cache(NULL),
  _def_use_cache(false), _use_cache(false),
  // We initialize the epochs of the array to 0. By initializing
  // _n_periods to 1 and not 0 we automatically invalidate all the
  // entries on the array. Otherwise we might accidentally think that
  // we claimed a card that was in fact never set (see CR7033292).
  _n_periods(1),
  _threads(NULL), _n_threads(0)
{

  // Ergonomically select initial concurrent refinement parameters
  if (FLAG_IS_DEFAULT(G1ConcRefinementGreenZone)) {
    FLAG_SET_DEFAULT(G1ConcRefinementGreenZone, MAX2<int>(ParallelGCThreads, 1));
  }
  set_green_zone(G1ConcRefinementGreenZone);

  if (FLAG_IS_DEFAULT(G1ConcRefinementYellowZone)) {
    FLAG_SET_DEFAULT(G1ConcRefinementYellowZone, green_zone() * 3);
  }
  set_yellow_zone(MAX2<int>(G1ConcRefinementYellowZone, green_zone()));

  if (FLAG_IS_DEFAULT(G1ConcRefinementRedZone)) {
    FLAG_SET_DEFAULT(G1ConcRefinementRedZone, yellow_zone() * 2);
  }
  set_red_zone(MAX2<int>(G1ConcRefinementRedZone, yellow_zone()));
  _n_worker_threads = thread_num();
  // We need one extra thread to do the young gen rset size sampling.
  _n_threads = _n_worker_threads + 1;
  reset_threshold_step();

  _threads = NEW_C_HEAP_ARRAY(ConcurrentG1RefineThread*, _n_threads, mtGC);
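  // The refinement threads' worker ids start after the ids that
  // DirtyCardQueueSet::num_par_ids() reserves for other dirty card
  // processing, so the two ranges of ids do not overlap.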
  int worker_id_offset = (int)DirtyCardQueueSet::num_par_ids();
  ConcurrentG1RefineThread *next = NULL;
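  // Create the threads in reverse order so that each thread can be passed
  // its successor via 'next' as it is constructed.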
  for (int i = _n_threads - 1; i >= 0; i--) {
    ConcurrentG1RefineThread* t = new ConcurrentG1RefineThread(this, next, worker_id_offset, i);
    assert(t != NULL, "Conc refine should have been created");
    assert(t->cg1r() == this, "Conc refine thread should refer to this");
    _threads[i] = t;
    next = t;
  }
}

void ConcurrentG1Refine::reset_threshold_step() {
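  // The threshold step is used to spread the activation thresholds of the
  // refinement worker threads evenly between the green and yellow zones.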
  if (FLAG_IS_DEFAULT(G1ConcRefinementThresholdStep)) {
    _thread_threshold_step = (yellow_zone() - green_zone()) / (worker_thread_num() + 1);
  } else {
    _thread_threshold_step = G1ConcRefinementThresholdStep;
  }
}

int ConcurrentG1Refine::thread_num() {
  return MAX2<int>((G1ConcRefinementThreads > 0) ? G1ConcRefinementThreads : ParallelGCThreads, 1);
}

void ConcurrentG1Refine::init() {
  if (G1ConcRSLogCacheSize > 0) {
    _g1h = G1CollectedHeap::heap();

    _max_cards = _g1h->max_capacity() >> CardTableModRefBS::card_shift;
    _max_n_card_counts = _max_cards * G1MaxHotCardCountSizePercent / 100;

    size_t max_card_num = ((size_t)1 << (sizeof(unsigned)*BitsPerByte-1)) - 1;
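    // i.e. 2^31 - 1, the largest card number we allow: card numbers are kept
    // in an unsigned int, so the number of cards must stay below this limit.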
    guarantee(_max_cards < max_card_num, "card_num representation");

    // We need _n_card_counts to be less than _max_n_card_counts here
    // so that the expansion call (below) actually allocates the
    // _card_counts and _card_epochs arrays.
    assert(_n_card_counts == 0, "pre-condition");
    assert(_max_n_card_counts > 0, "pre-condition");

    // Find the index into the cache size array of a size large enough
    // to hold desired_sz.
    size_t desired_sz = _max_cards / InitialCacheFraction;
    int desired_sz_index = 0;
    while (_cc_cache_sizes[desired_sz_index] < desired_sz) {
      desired_sz_index += 1;
      assert(desired_sz_index <  MAX_CC_CACHE_INDEX, "invariant");
    }
    assert(desired_sz_index <  MAX_CC_CACHE_INDEX, "invariant");

    // If the desired_sz value is between two sizes, i.e.
    // _cc_cache_sizes[desired_sz_index-1] < desired_sz <= _cc_cache_sizes[desired_sz_index],
    // we will start with the lower size in the optimistic expectation that
    // we will not need to expand up. Note that desired_sz_index could also be 0.
    if (desired_sz_index > 0 &&
        _cc_cache_sizes[desired_sz_index] > desired_sz) {
      desired_sz_index -= 1;
    }

    if (!expand_card_count_cache(desired_sz_index)) {
      // Allocation was unsuccessful - exit
      vm_exit_during_initialization("Could not reserve enough space for card count cache");
    }
    assert(_n_card_counts > 0, "post-condition");
    assert(_cache_size_index == desired_sz_index, "post-condition");

    Copy::fill_to_bytes(&_card_counts[0],
                        _n_card_counts * sizeof(CardCountCacheEntry));
    Copy::fill_to_bytes(&_card_epochs[0], _n_card_counts * sizeof(CardEpochCacheEntry));

    ModRefBarrierSet* bs = _g1h->mr_bs();
    guarantee(bs->is_a(BarrierSet::CardTableModRef), "Precondition");
    _ct_bs = (CardTableModRefBS*)bs;
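    // Cache the card table entry for the bottom of the reserved region;
    // card pointers are converted to card numbers relative to this
    // (see ptr_2_card_num()).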
    _ct_bot = _ct_bs->byte_for_const(_g1h->reserved_region().start());

    _def_use_cache = true;
    _use_cache = true;
    _hot_cache_size = (1 << G1ConcRSLogCacheSize);
    _hot_cache = NEW_C_HEAP_ARRAY(jbyte*, _hot_cache_size, mtGC);
    _n_hot = 0;
    _hot_cache_idx = 0;

    // For refining the cards in the hot cache in parallel
    int n_workers = (ParallelGCThreads > 0 ?
                        _g1h->workers()->total_workers() : 1);
    _hot_cache_par_chunk_size = MAX2(1, _hot_cache_size / n_workers);
    _hot_cache_par_claimed_idx = 0;
  }
}

void ConcurrentG1Refine::stop() {
  if (_threads != NULL) {
    for (int i = 0; i < _n_threads; i++) {
      _threads[i]->stop();
    }
  }
}

void ConcurrentG1Refine::reinitialize_threads() {
  reset_threshold_step();
  if (_threads != NULL) {
    for (int i = 0; i < _n_threads; i++) {
      _threads[i]->initialize();
    }
  }
}

ConcurrentG1Refine::~ConcurrentG1Refine() {
  if (G1ConcRSLogCacheSize > 0) {
    // Please see the comment in allocate_card_count_cache
    // for why we call os::malloc() and os::free() directly.
    assert(_card_counts != NULL, "Logic");
    os::free(_card_counts, mtGC);
    assert(_card_epochs != NULL, "Logic");
    os::free(_card_epochs, mtGC);

    assert(_hot_cache != NULL, "Logic");
    FREE_C_HEAP_ARRAY(jbyte*, _hot_cache, mtGC);
  }
  if (_threads != NULL) {
    for (int i = 0; i < _n_threads; i++) {
      delete _threads[i];
    }
    FREE_C_HEAP_ARRAY(ConcurrentG1RefineThread*, _threads, mtGC);
  }
}

void ConcurrentG1Refine::threads_do(ThreadClosure *tc) {
  if (_threads != NULL) {
    for (int i = 0; i < _n_threads; i++) {
      tc->do_thread(_threads[i]);
    }
  }
}

bool ConcurrentG1Refine::is_young_card(jbyte* card_ptr) {
  HeapWord* start = _ct_bs->addr_for(card_ptr);
  HeapRegion* r = _g1h->heap_region_containing(start);
  if (r != NULL && r->is_young()) {
    return true;
  }
  // Either the card is not covered by a heap region, or it is covered
  // by a region that is not young.
  return false;
}

jbyte* ConcurrentG1Refine::add_card_count(jbyte* card_ptr, int* count, bool* defer) {
  unsigned new_card_num = ptr_2_card_num(card_ptr);
  unsigned bucket = hash(new_card_num);
  assert(0 <= bucket && bucket < _n_card_counts, "Bounds");

  CardCountCacheEntry* count_ptr = &_card_counts[bucket];
  CardEpochCacheEntry* epoch_ptr = &_card_epochs[bucket];
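  // The counts and epochs arrays are parallel: the same bucket index is
  // used in both (see allocate_card_count_cache()).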

  // We have to construct a new entry if we haven't updated the counts
  // during the current period, or if the count was updated for a
  // different card number.
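  // The card number and the epoch are packed into a single 64-bit epoch
  // entry so that both can be published with a single CAS.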
  unsigned int new_epoch = (unsigned int) _n_periods;
  julong new_epoch_entry = make_epoch_entry(new_card_num, new_epoch);

  while (true) {
    // Fetch the previous epoch value
    julong prev_epoch_entry = epoch_ptr->_value;
    julong cas_res;

    if (extract_epoch(prev_epoch_entry) != new_epoch) {
      // This entry has not yet been updated during this period.
      // Note: we update the epoch value atomically to ensure
      // that there is only one winner that updates the cached
      // card_ptr value even though all the refine threads share
      // the same epoch value.

      cas_res = (julong) Atomic::cmpxchg((jlong) new_epoch_entry,
                                         (volatile jlong*)&epoch_ptr->_value,
                                         (jlong) prev_epoch_entry);

      if (cas_res == prev_epoch_entry) {
        // We have successfully won the race to update the
        // epoch and card_num value. Make it look like the
        // count and eviction count were previously cleared.
        count_ptr->_count = 1;
        count_ptr->_evict_count = 0;
        *count = 0;
        // We can defer the processing of card_ptr
        *defer = true;
        return card_ptr;
      }
      // We did not win the race to update the epoch field, so some other
      // thread must have done it. The value that gets returned by CAS
      // should be the new epoch value.
      assert(extract_epoch(cas_res) == new_epoch, "unexpected epoch");
      // We could 'continue' here or just re-read the previous epoch value
      prev_epoch_entry = epoch_ptr->_value;
    }

    // The epoch entry for card_ptr has been updated during this period.
    unsigned old_card_num = extract_card_num(prev_epoch_entry);

    // The card count that will be returned to caller
    *count = count_ptr->_count;

    // Are we updating the count for the same card?
    if (new_card_num == old_card_num) {
      // Same card - just update the count. We could have more than one
      // thread racing to update count for the current card. It should be
      // OK not to use a CAS as the only penalty should be some missed
      // increments of the count which delays identifying the card as "hot".

      if (*count < max_jubyte) count_ptr->_count++;
      // We can defer the processing of card_ptr
      *defer = true;
      return card_ptr;
    }

    // Different card - evict old card info
    if (count_ptr->_evict_count < max_jubyte) count_ptr->_evict_count++;
    if (count_ptr->_evict_count > G1CardCountCacheExpandThreshold) {
      // Trigger a resize the next time we clear
      _expand_card_counts = true;
    }

    cas_res = (julong) Atomic::cmpxchg((jlong) new_epoch_entry,
                                       (volatile jlong*)&epoch_ptr->_value,
                                       (jlong) prev_epoch_entry);

    if (cas_res == prev_epoch_entry) {
      // We successfully updated the card num value in the epoch entry
      count_ptr->_count = 0; // initialize counter for new card num
      jbyte* old_card_ptr = card_num_2_ptr(old_card_num);

      // Even though the region containing the card at old_card_num was not
      // in the young list when old_card_num was recorded in the epoch
      // cache it could have been added to the free list and subsequently
      // added to the young list in the intervening time. See CR 6817995.
      // We do not deal with this case here - it will be handled in
      // HeapRegion::oops_on_card_seq_iterate_careful after it has been
      // determined that the region containing the card has been allocated
      // to, and it's safe to check the young type of the region.

      // We do not want to defer processing of card_ptr in this case
      // (we need to refine old_card_ptr and card_ptr)
      *defer = false;
      return old_card_ptr;
    }
    // Someone else beat us - try again.
  }
}

jbyte* ConcurrentG1Refine::cache_insert(jbyte* card_ptr, bool* defer) {
  int count;
  jbyte* cached_ptr = add_card_count(card_ptr, &count, defer);
  assert(cached_ptr != NULL, "bad cached card ptr");

  // We've just inserted a card pointer into the card count cache
  // and got back either the card that we just inserted or the (evicted)
  // previous contents of that count slot.

  // The card we got back could be in a young region. When the
  // returned card (if evicted) was originally inserted, we had
  // determined that its containing region was not young. However
  // it is possible for the region to be freed during a cleanup
  // pause, then reallocated and tagged as young which will result
  // in the returned card residing in a young region.
  //
  // We do not deal with this case here - the change from non-young
  // to young could be observed at any time - it will be handled in
  // HeapRegion::oops_on_card_seq_iterate_careful after it has been
  // determined that the region containing the card has been allocated
  // to.

  // The card pointer we obtained from card count cache is not hot
  // so do not store it in the cache; return it for immediate
  // refining.
  if (count < G1ConcRSHotCardLimit) {
    return cached_ptr;
  }

  // Otherwise, the pointer we got from the _card_counts cache is hot.
  jbyte* res = NULL;
  MutexLockerEx x(HotCardCache_lock, Mutex::_no_safepoint_check_flag);
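  // The hot cache is a fixed-size circular buffer. If it is full, evict the
  // entry at the current insertion index (the oldest entry) and return it to
  // the caller for immediate refinement.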
  if (_n_hot == _hot_cache_size) {
    res = _hot_cache[_hot_cache_idx];
    _n_hot--;
  }
  // Now _n_hot < _hot_cache_size, and we can insert at _hot_cache_idx.
  _hot_cache[_hot_cache_idx] = cached_ptr;
  _hot_cache_idx++;
  if (_hot_cache_idx == _hot_cache_size) _hot_cache_idx = 0;
  _n_hot++;

  // The card obtained from the hot card cache could be in a young
  // region. See above on how this can happen.

  return res;
}

void ConcurrentG1Refine::clean_up_cache(int worker_i,
                                        G1RemSet* g1rs,
                                        DirtyCardQueue* into_cset_dcq) {
  assert(!use_cache(), "cache should be disabled");
  int start_idx;

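  // Drain the hot cache: workers repeatedly claim chunks of it by CAS-ing
  // _hot_cache_par_claimed_idx forward and refine the cards in their chunk.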
  while ((start_idx = _hot_cache_par_claimed_idx) < _n_hot) { // read once
    int end_idx = start_idx + _hot_cache_par_chunk_size;

    if (start_idx ==
        Atomic::cmpxchg(end_idx, &_hot_cache_par_claimed_idx, start_idx)) {
      // The current worker has successfully claimed the chunk [start_idx..end_idx)
      end_idx = MIN2(end_idx, _n_hot);
      for (int i = start_idx; i < end_idx; i++) {
        jbyte* entry = _hot_cache[i];
        if (entry != NULL) {
          if (g1rs->concurrentRefineOneCard(entry, worker_i, true)) {
            // 'entry' contains references that point into the current
            // collection set. We need to record 'entry' in the DCQS
            // that's used for that purpose.
            //
            // The only time we care about recording cards that contain
            // references that point into the collection set is during
            // RSet updating while within an evacuation pause.
            // In this case worker_i should be the id of a GC worker thread
            assert(SafepointSynchronize::is_at_safepoint(), "not during an evacuation pause");
            assert(worker_i < (int) (ParallelGCThreads == 0 ? 1 : ParallelGCThreads), "incorrect worker id");
            into_cset_dcq->enqueue(entry);
          }
        }
      }
    }
  }
}

// The arrays used to hold the card counts and the epochs must have
// a 1:1 correspondence. Hence they are allocated and freed together.
// Returns true if the allocations of both the counts and epochs
// were successful; false otherwise.
bool ConcurrentG1Refine::allocate_card_count_cache(size_t n,
                                                   CardCountCacheEntry** counts,
                                                   CardEpochCacheEntry** epochs) {
  // We call the allocation/free routines directly for the counts
  // and epochs arrays. The NEW_C_HEAP_ARRAY/FREE_C_HEAP_ARRAY
  // macros call AllocateHeap and FreeHeap respectively.
  // AllocateHeap will call vm_exit_out_of_memory in the event
  // of an allocation failure and abort the JVM. With the
  // _counts/epochs arrays we only need to abort the JVM if the
  // initial allocation of these arrays fails.
  //
  // Additionally AllocateHeap/FreeHeap do some tracing of
  // allocate/free calls so calling one without calling the
  // other can cause inconsistencies in the tracing. So we
  // call neither.

  assert(*counts == NULL, "out param");
  assert(*epochs == NULL, "out param");

  size_t counts_size = n * sizeof(CardCountCacheEntry);
  size_t epochs_size = n * sizeof(CardEpochCacheEntry);

  *counts = (CardCountCacheEntry*) os::malloc(counts_size, mtGC);
  if (*counts == NULL) {
    // allocation was unsuccessful
    return false;
  }

  *epochs = (CardEpochCacheEntry*) os::malloc(epochs_size, mtGC);
  if (*epochs == NULL) {
    // allocation was unsuccessful - free counts array
    assert(*counts != NULL, "must be");
    os::free(*counts, mtGC);
    *counts = NULL;
    return false;
  }

  // We successfully allocated both counts and epochs
  return true;
}

// Returns true if the card counts/epochs cache was
// successfully expanded; false otherwise.
bool ConcurrentG1Refine::expand_card_count_cache(int cache_size_idx) {
  // Can we expand the card count and epoch tables?
  if (_n_card_counts < _max_n_card_counts) {
    assert(cache_size_idx >= 0 && cache_size_idx  < MAX_CC_CACHE_INDEX, "oob");

    size_t cache_size = _cc_cache_sizes[cache_size_idx];
    // Make sure we don't go bigger than we will ever need
    cache_size = MIN2(cache_size, _max_n_card_counts);

    // Should we expand the card count and card epoch tables?
    if (cache_size > _n_card_counts) {
      // We have been asked to allocate new, larger, arrays for
      // the card counts and the epochs. Attempt the allocation
      // of both before we free the existing arrays in case
      // the allocation is unsuccessful...
      CardCountCacheEntry* counts = NULL;
      CardEpochCacheEntry* epochs = NULL;

      if (allocate_card_count_cache(cache_size, &counts, &epochs)) {
        // Allocation was successful.
        // We can just free the old arrays; we're
        // not interested in preserving the contents
        if (_card_counts != NULL) os::free(_card_counts, mtGC);
        if (_card_epochs != NULL) os::free(_card_epochs, mtGC);

        // Cache the size of the arrays and the index that got us there.
        _n_card_counts = cache_size;
        _cache_size_index = cache_size_idx;

        _card_counts = counts;
        _card_epochs = epochs;

        // We successfully allocated/expanded the caches.
        return true;
      }
    }
  }

  // We did not successfully expand the caches.
  return false;
}

void ConcurrentG1Refine::clear_and_record_card_counts() {
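  // Start a new counting period: expand the counts cache if an expansion was
  // requested and bump the epoch so that all cached entries become stale.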
  if (G1ConcRSLogCacheSize == 0) {
    return;
  }

  double start = os::elapsedTime();

  if (_expand_card_counts) {
    int new_idx = _cache_size_index + 1;

    if (expand_card_count_cache(new_idx)) {
      // Allocation was successful and _n_card_counts has
      // been updated to the new size. We only need to clear
      // the epochs so we don't read a bogus epoch value
      // when inserting a card into the hot card cache.
      Copy::fill_to_bytes(&_card_epochs[0], _n_card_counts * sizeof(CardEpochCacheEntry));
    }
    _expand_card_counts = false;
  }

  int this_epoch = (int) _n_periods;
  assert((this_epoch+1) <= max_jint, "too many periods");
  // Update epoch
  _n_periods++;
  double cc_clear_time_ms = (os::elapsedTime() - start) * 1000;
  _g1h->g1_policy()->phase_times()->record_cc_clear_time_ms(cc_clear_time_ms);
}

void ConcurrentG1Refine::print_worker_threads_on(outputStream* st) const {
  for (int i = 0; i < _n_threads; ++i) {
    _threads[i]->print_on(st);
    st->cr();
  }
}