/*
 * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "incls/_precompiled.incl"
#include "incls/_concurrentG1Refine.cpp.incl"

// Possible sizes for the card counts cache: odd primes that roughly double in size.
// (See jvmtiTagMap.cpp).
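// The table is terminated by a -1 sentinel; when it is reached the
// cache is sized directly to _max_n_card_counts.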
int ConcurrentG1Refine::_cc_cache_sizes[] = {
        16381,    32771,    76831,    150001,   307261,
       614563,  1228891,  2457733,   4915219,  9830479,
     19660831, 39321619, 78643219, 157286461,       -1
  };

ConcurrentG1Refine::ConcurrentG1Refine() :
  _card_counts(NULL), _card_epochs(NULL),
  _n_card_counts(0), _max_n_card_counts(0),
  _cache_size_index(0), _expand_card_counts(false),
  _hot_cache(NULL),
  _def_use_cache(false), _use_cache(false),
  _n_periods(0),
  _threads(NULL), _n_threads(0)
{

  // Ergonomically select initial concurrent refinement parameters.
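  // The zones are thresholds on the number of completed update
  // buffers: the refinement threads leave roughly green_zone()
  // buffers unprocessed, become active as the count approaches the
  // yellow zone, and past the red zone mutator threads process new
  // buffers themselves instead of enqueueing them.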
  if (FLAG_IS_DEFAULT(G1ConcRefinementGreenZone)) {
    FLAG_SET_DEFAULT(G1ConcRefinementGreenZone, MAX2<int>(ParallelGCThreads, 1));
  }
  set_green_zone(G1ConcRefinementGreenZone);

  if (FLAG_IS_DEFAULT(G1ConcRefinementYellowZone)) {
    FLAG_SET_DEFAULT(G1ConcRefinementYellowZone, green_zone() * 3);
  }
  set_yellow_zone(MAX2<int>(G1ConcRefinementYellowZone, green_zone()));

  if (FLAG_IS_DEFAULT(G1ConcRefinementRedZone)) {
    FLAG_SET_DEFAULT(G1ConcRefinementRedZone, yellow_zone() * 2);
  }
  set_red_zone(MAX2<int>(G1ConcRefinementRedZone, yellow_zone()));
  _n_worker_threads = thread_num();
  // We need one extra thread to do the young gen rset size sampling.
  _n_threads = _n_worker_threads + 1;
  reset_threshold_step();

  _threads = NEW_C_HEAP_ARRAY(ConcurrentG1RefineThread*, _n_threads);
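  // Worker ids for the refinement threads start after the parallel
  // ids reserved by the DirtyCardQueueSet for mutator threads.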
  int worker_id_offset = (int)DirtyCardQueueSet::num_par_ids();
  ConcurrentG1RefineThread *next = NULL;
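  // Create the threads last-to-first so that each one can be linked
  // to its successor as it is constructed.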
  for (int i = _n_threads - 1; i >= 0; i--) {
    ConcurrentG1RefineThread* t = new ConcurrentG1RefineThread(this, next, worker_id_offset, i);
    assert(t != NULL, "Conc refine should have been created");
    assert(t->cg1r() == this, "Conc refine thread should refer to this");
    _threads[i] = t;
    next = t;
  }
}

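// Each refinement worker is activated at a successively higher number
// of pending buffers; the step spreads the activation thresholds
// evenly across the interval between the green and yellow zones.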
void ConcurrentG1Refine::reset_threshold_step() {
  if (FLAG_IS_DEFAULT(G1ConcRefinementThresholdStep)) {
    _thread_threshold_step = (yellow_zone() - green_zone()) / (worker_thread_num() + 1);
  } else {
    _thread_threshold_step = G1ConcRefinementThresholdStep;
  }
}

int ConcurrentG1Refine::thread_num() {
  return MAX2<int>((G1ConcRefinementThreads > 0) ? G1ConcRefinementThreads : ParallelGCThreads, 1);
}

void ConcurrentG1Refine::init() {
  if (G1ConcRSLogCacheSize > 0) {
    _g1h = G1CollectedHeap::heap();
    _max_n_card_counts =
      (unsigned) (_g1h->g1_reserved_obj_bytes() >> CardTableModRefBS::card_shift);

    size_t max_card_num = ((size_t)1 << (sizeof(unsigned)*BitsPerByte-1)) - 1;
    guarantee(_max_n_card_counts < max_card_num, "card_num representation");

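    // Pick the starting index into the size table: scan to the first
    // entry at least as large as desired, then step back one (the
    // cache is grown on demand if it turns out to be too small).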
    int desired = _max_n_card_counts / InitialCacheFraction;
    for (_cache_size_index = 0;
         _cc_cache_sizes[_cache_size_index] >= 0; _cache_size_index++) {
      if (_cc_cache_sizes[_cache_size_index] >= desired) break;
    }
    _cache_size_index = MAX2(0, (_cache_size_index - 1));

    int initial_size = _cc_cache_sizes[_cache_size_index];
    if (initial_size < 0) initial_size = _max_n_card_counts;

    // Make sure we don't go bigger than we will ever need
    _n_card_counts = MIN2((unsigned) initial_size, _max_n_card_counts);

    _card_counts = NEW_C_HEAP_ARRAY(CardCountCacheEntry, _n_card_counts);
    _card_epochs = NEW_C_HEAP_ARRAY(CardEpochCacheEntry, _n_card_counts);

    Copy::fill_to_bytes(&_card_counts[0],
                        _n_card_counts * sizeof(CardCountCacheEntry));
    Copy::fill_to_bytes(&_card_epochs[0], _n_card_counts * sizeof(CardEpochCacheEntry));

    ModRefBarrierSet* bs = _g1h->mr_bs();
    guarantee(bs->is_a(BarrierSet::CardTableModRef), "Precondition");
    _ct_bs = (CardTableModRefBS*)bs;
    _ct_bot = _ct_bs->byte_for_const(_g1h->reserved_region().start());

    _def_use_cache = true;
    _use_cache = true;
    _hot_cache_size = (1 << G1ConcRSLogCacheSize);
    _hot_cache = NEW_C_HEAP_ARRAY(jbyte*, _hot_cache_size);
    _n_hot = 0;
    _hot_cache_idx = 0;

    // For refining the cards in the hot cache in parallel
    int n_workers = (ParallelGCThreads > 0 ?
                        _g1h->workers()->total_workers() : 1);
    _hot_cache_par_chunk_size = MAX2(1, _hot_cache_size / n_workers);
    _hot_cache_par_claimed_idx = 0;
  }
}

void ConcurrentG1Refine::stop() {
  if (_threads != NULL) {
    for (int i = 0; i < _n_threads; i++) {
      _threads[i]->stop();
    }
  }
}

void ConcurrentG1Refine::reinitialize_threads() {
  reset_threshold_step();
  if (_threads != NULL) {
    for (int i = 0; i < _n_threads; i++) {
      _threads[i]->initialize();
    }
  }
}

ConcurrentG1Refine::~ConcurrentG1Refine() {
  if (G1ConcRSLogCacheSize > 0) {
    assert(_card_counts != NULL, "Logic");
    FREE_C_HEAP_ARRAY(CardCountCacheEntry, _card_counts);
    assert(_card_epochs != NULL, "Logic");
    FREE_C_HEAP_ARRAY(CardEpochCacheEntry, _card_epochs);
    assert(_hot_cache != NULL, "Logic");
    FREE_C_HEAP_ARRAY(jbyte*, _hot_cache);
  }
  if (_threads != NULL) {
    for (int i = 0; i < _n_threads; i++) {
      delete _threads[i];
    }
    FREE_C_HEAP_ARRAY(ConcurrentG1RefineThread*, _threads);
  }
}

void ConcurrentG1Refine::threads_do(ThreadClosure *tc) {
  if (_threads != NULL) {
    for (int i = 0; i < _n_threads; i++) {
      tc->do_thread(_threads[i]);
    }
  }
}

bool ConcurrentG1Refine::is_young_card(jbyte* card_ptr) {
  HeapWord* start = _ct_bs->addr_for(card_ptr);
  HeapRegion* r = _g1h->heap_region_containing(start);
  if (r != NULL && r->is_young()) {
    return true;
  }
  // The card is in a non-young region, or is not associated with a
  // heap region at all, so it cannot be young.
  return false;
}

jbyte* ConcurrentG1Refine::add_card_count(jbyte* card_ptr, int* count, bool* defer) {
  unsigned new_card_num = ptr_2_card_num(card_ptr);
  unsigned bucket = hash(new_card_num);
  assert(0 <= bucket && bucket < _n_card_counts, "Bounds");

  CardCountCacheEntry* count_ptr = &_card_counts[bucket];
  CardEpochCacheEntry* epoch_ptr = &_card_epochs[bucket];

  // We have to construct a new entry if we haven't updated the counts
  // during the current period, or if the count was updated for a
  // different card number.
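  // The card number and epoch are packed into a single 64-bit entry
  // so that both can be swapped with one atomic cmpxchg below.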
  unsigned int new_epoch = (unsigned int) _n_periods;
  julong new_epoch_entry = make_epoch_entry(new_card_num, new_epoch);

  while (true) {
    // Fetch the previous epoch value
    julong prev_epoch_entry = epoch_ptr->_value;
    julong cas_res;

    if (extract_epoch(prev_epoch_entry) != new_epoch) {
      // This entry has not yet been updated during this period.
      // Note: we update the epoch value atomically to ensure
      // that there is only one winner that updates the cached
      // card_ptr value even though all the refine threads share
      // the same epoch value.

      cas_res = (julong) Atomic::cmpxchg((jlong) new_epoch_entry,
                                         (volatile jlong*)&epoch_ptr->_value,
                                         (jlong) prev_epoch_entry);

      if (cas_res == prev_epoch_entry) {
        // We have successfully won the race to update the
        // epoch and card_num value. Make it look like the
        // count and eviction count were previously cleared.
        count_ptr->_count = 1;
        count_ptr->_evict_count = 0;
        *count = 0;
        // We can defer the processing of card_ptr
        *defer = true;
        return card_ptr;
      }
      // We did not win the race to update the epoch field, so some other
      // thread must have done it. The value that gets returned by CAS
      // should be the new epoch value.
      assert(extract_epoch(cas_res) == new_epoch, "unexpected epoch");
      // We could 'continue' here or just re-read the previous epoch value
      prev_epoch_entry = epoch_ptr->_value;
    }

    // The epoch entry for card_ptr has been updated during this period.
    unsigned old_card_num = extract_card_num(prev_epoch_entry);

    // The card count that will be returned to caller
    *count = count_ptr->_count;

    // Are we updating the count for the same card?
    if (new_card_num == old_card_num) {
      // Same card - just update the count. We could have more than one
      // thread racing to update count for the current card. It should be
      // OK not to use a CAS as the only penalty should be some missed
      // increments of the count which delays identifying the card as "hot".

      if (*count < max_jubyte) count_ptr->_count++;
      // We can defer the processing of card_ptr
      *defer = true;
      return card_ptr;
    }

    // Different card - evict old card info
    if (count_ptr->_evict_count < max_jubyte) count_ptr->_evict_count++;
    if (count_ptr->_evict_count > G1CardCountCacheExpandThreshold) {
      // Trigger a resize the next time we clear
      _expand_card_counts = true;
    }

    cas_res = (julong) Atomic::cmpxchg((jlong) new_epoch_entry,
                                       (volatile jlong*)&epoch_ptr->_value,
                                       (jlong) prev_epoch_entry);

    if (cas_res == prev_epoch_entry) {
      // We successfully updated the card num value in the epoch entry
      count_ptr->_count = 0; // initialize counter for new card num
      jbyte* old_card_ptr = card_num_2_ptr(old_card_num);

      // Even though the region containing the card at old_card_num was not
      // in the young list when old_card_num was recorded in the epoch
      // cache it could have been added to the free list and subsequently
      // added to the young list in the intervening time. See CR 6817995.
      // We do not deal with this case here - it will be handled in
      // HeapRegion::oops_on_card_seq_iterate_careful after it has been
      // determined that the region containing the card has been allocated
      // to, and it's safe to check the young type of the region.

      // We do not want to defer processing of card_ptr in this case
      // (we need to refine old_card_ptr and card_ptr)
      *defer = false;
      return old_card_ptr;
    }
    // Someone else beat us - try again.
  }
}

jbyte* ConcurrentG1Refine::cache_insert(jbyte* card_ptr, bool* defer) {
  int count;
  jbyte* cached_ptr = add_card_count(card_ptr, &count, defer);
  assert(cached_ptr != NULL, "bad cached card ptr");

  // We've just inserted a card pointer into the card count cache
  // and got back either the card we just inserted or the (evicted)
  // previous contents of that count slot.

  // The card we got back could be in a young region. When the
  // returned card (if evicted) was originally inserted, we had
  // determined that its containing region was not young. However
  // it is possible for the region to be freed during a cleanup
  // pause, then reallocated and tagged as young which will result
  // in the returned card residing in a young region.
  //
  // We do not deal with this case here - the change from non-young
  // to young could be observed at any time - it will be handled in
  // HeapRegion::oops_on_card_seq_iterate_careful after it has been
  // determined that the region containing the card has been allocated
  // to.

  // The card pointer we obtained from the card count cache is not hot
  // so do not store it in the cache; return it for immediate
  // refining.
  if (count < G1ConcRSHotCardLimit) {
    return cached_ptr;
  }

  // Otherwise, the pointer we got from the _card_counts cache is hot.
  jbyte* res = NULL;
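  // The hot cache is a fixed-size ring buffer guarded by
  // HotCardCache_lock; when it is full the oldest entry (at
  // _hot_cache_idx) is evicted and returned for immediate refinement.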
  MutexLockerEx x(HotCardCache_lock, Mutex::_no_safepoint_check_flag);
  if (_n_hot == _hot_cache_size) {
    res = _hot_cache[_hot_cache_idx];
    _n_hot--;
  }
  // Now _n_hot < _hot_cache_size, and we can insert at _hot_cache_idx.
  _hot_cache[_hot_cache_idx] = cached_ptr;
  _hot_cache_idx++;
  if (_hot_cache_idx == _hot_cache_size) _hot_cache_idx = 0;
  _n_hot++;

  // The card obtained from the hot card cache could be in a young
  // region. See above on how this can happen.

  return res;
}

void ConcurrentG1Refine::clean_up_cache(int worker_i,
                                        G1RemSet* g1rs,
                                        DirtyCardQueue* into_cset_dcq) {
  assert(!use_cache(), "cache should be disabled");
  int start_idx;

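  // Workers race to claim fixed-size chunks of the hot cache by
  // advancing _hot_cache_par_claimed_idx with a CAS; a failed CAS
  // simply means another worker claimed the chunk first.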
  while ((start_idx = _hot_cache_par_claimed_idx) < _n_hot) { // read once
    int end_idx = start_idx + _hot_cache_par_chunk_size;

    if (start_idx ==
        Atomic::cmpxchg(end_idx, &_hot_cache_par_claimed_idx, start_idx)) {
      // The current worker has successfully claimed the chunk [start_idx..end_idx)
      end_idx = MIN2(end_idx, _n_hot);
      for (int i = start_idx; i < end_idx; i++) {
        jbyte* entry = _hot_cache[i];
        if (entry != NULL) {
          if (g1rs->concurrentRefineOneCard(entry, worker_i, true)) {
            // 'entry' contains references that point into the current
            // collection set. We need to record 'entry' in the DCQS
            // that's used for that purpose.
            //
            // The only time we care about recording cards that contain
            // references that point into the collection set is during
            // RSet updating while within an evacuation pause.
            // In this case worker_i should be the id of a GC worker thread
            assert(SafepointSynchronize::is_at_safepoint(), "not during an evacuation pause");
            assert(worker_i < (int) DirtyCardQueueSet::num_par_ids(), "incorrect worker id");
            into_cset_dcq->enqueue(entry);
          }
        }
      }
    }
  }
}

void ConcurrentG1Refine::expand_card_count_cache() {
  if (_n_card_counts < _max_n_card_counts) {
    int new_idx = _cache_size_index + 1;
    int new_size = _cc_cache_sizes[new_idx];
    if (new_size < 0) new_size = _max_n_card_counts;

    // Make sure we don't go bigger than we will ever need
    new_size = MIN2((unsigned) new_size, _max_n_card_counts);

    // Expand the card count and card epoch tables
    if (new_size > (int)_n_card_counts) {
      // We can just free and allocate a new array as we're
      // not interested in preserving the contents
      assert(_card_counts != NULL, "Logic!");
      assert(_card_epochs != NULL, "Logic!");
      FREE_C_HEAP_ARRAY(CardCountCacheEntry, _card_counts);
      FREE_C_HEAP_ARRAY(CardEpochCacheEntry, _card_epochs);
      _n_card_counts = new_size;
      _card_counts = NEW_C_HEAP_ARRAY(CardCountCacheEntry, _n_card_counts);
      _card_epochs = NEW_C_HEAP_ARRAY(CardEpochCacheEntry, _n_card_counts);
      _cache_size_index = new_idx;
    }
  }
}

void ConcurrentG1Refine::clear_and_record_card_counts() {
  if (G1ConcRSLogCacheSize == 0) return;

#ifndef PRODUCT
  double start = os::elapsedTime();
#endif

  if (_expand_card_counts) {
    expand_card_count_cache();
    _expand_card_counts = false;
    // Only need to clear the epochs.
    Copy::fill_to_bytes(&_card_epochs[0], _n_card_counts * sizeof(CardEpochCacheEntry));
  }

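  // Advancing the epoch implicitly invalidates every cached entry:
  // add_card_count() treats an entry whose epoch differs from the
  // current period as empty, so no explicit clearing is needed here.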
  int this_epoch = (int) _n_periods;
  assert((this_epoch + 1) <= max_jint, "too many periods");
  // Update epoch
  _n_periods++;

#ifndef PRODUCT
  double elapsed = os::elapsedTime() - start;
  _g1h->g1_policy()->record_cc_clear_time(elapsed * 1000.0);
#endif
}

void ConcurrentG1Refine::print_worker_threads_on(outputStream* st) const {
  for (int i = 0; i < _n_threads; ++i) {
    _threads[i]->print_on(st);
    st->cr();
  }
}