Print this page
rev 2724 : 6484965: G1: piggy-back liveness accounting phase on marking
Summary: Remove the separate counting phase of concurrent marking by tracking the amount of marked bytes and the cards spanned by marked objects in marking task/worker thread local data structures, which are updated as individual objects are marked.
Reviewed-by:

Split Close
Expand all
Collapse all
          --- old/src/share/vm/gc_implementation/g1/concurrentMark.cpp
          +++ new/src/share/vm/gc_implementation/g1/concurrentMark.cpp
↓ open down ↓ 465 lines elided ↑ open up ↑
 466  466    _parallel_marking_threads(0),
 467  467    _sleep_factor(0.0),
 468  468    _marking_task_overhead(1.0),
 469  469    _cleanup_sleep_factor(0.0),
 470  470    _cleanup_task_overhead(1.0),
 471  471    _cleanup_list("Cleanup List"),
 472  472    _region_bm(max_regions, false /* in_resource_area*/),
 473  473    _card_bm((rs.size() + CardTableModRefBS::card_size - 1) >>
 474  474             CardTableModRefBS::card_shift,
 475  475             false /* in_resource_area*/),
      476 +
 476  477    _prevMarkBitMap(&_markBitMap1),
 477  478    _nextMarkBitMap(&_markBitMap2),
 478  479    _at_least_one_mark_complete(false),
 479  480  
 480  481    _markStack(this),
 481  482    _regionStack(),
 482  483    // _finger set in set_non_marking_state
 483  484  
 484  485    _max_task_num(MAX2(ParallelGCThreads, (size_t)1)),
 485  486    // _active_tasks set in set_non_marking_state
↓ open down ↓ 9 lines elided ↑ open up ↑
 495  496    _should_gray_objects(false),
 496  497  
 497  498    // _verbose_level set below
 498  499  
 499  500    _init_times(),
 500  501    _remark_times(), _remark_mark_times(), _remark_weak_ref_times(),
 501  502    _cleanup_times(),
 502  503    _total_counting_time(0.0),
 503  504    _total_rs_scrub_time(0.0),
 504  505  
 505      -  _parallel_workers(NULL) {
      506 +  _parallel_workers(NULL),
      507 +
      508 +  _count_card_bitmaps(NULL),
      509 +  _count_marked_bytes(NULL)
      510 +{
 506  511    CMVerboseLevel verbose_level = (CMVerboseLevel) G1MarkingVerboseLevel;
 507  512    if (verbose_level < no_verbose) {
 508  513      verbose_level = no_verbose;
 509  514    }
 510  515    if (verbose_level > high_verbose) {
 511  516      verbose_level = high_verbose;
 512  517    }
 513  518    _verbose_level = verbose_level;
 514  519  
 515  520    if (verbose_low()) {
↓ open down ↓ 13 lines elided ↑ open up ↑
 529  534    assert(CGC_lock != NULL, "Where's the CGC_lock?");
 530  535    assert(_markBitMap1.covers(rs), "_markBitMap1 inconsistency");
 531  536    assert(_markBitMap2.covers(rs), "_markBitMap2 inconsistency");
 532  537  
 533  538    SATBMarkQueueSet& satb_qs = JavaThread::satb_mark_queue_set();
 534  539    satb_qs.set_buffer_size(G1SATBBufferSize);
 535  540  
 536  541    _tasks = NEW_C_HEAP_ARRAY(CMTask*, _max_task_num);
 537  542    _accum_task_vtime = NEW_C_HEAP_ARRAY(double, _max_task_num);
 538  543  
      544 +  _count_card_bitmaps = NEW_C_HEAP_ARRAY(BitMap, _max_task_num);
      545 +  _count_marked_bytes = NEW_C_HEAP_ARRAY(size_t*, _max_task_num);
      546 +
      547 +  BitMap::idx_t card_bm_size = _card_bm.size();
      548 +
 539  549    // so that the assertion in MarkingTaskQueue::task_queue doesn't fail
 540  550    _active_tasks = _max_task_num;
 541  551    for (int i = 0; i < (int) _max_task_num; ++i) {
 542  552      CMTaskQueue* task_queue = new CMTaskQueue();
 543  553      task_queue->initialize();
 544  554      _task_queues->register_queue(i, task_queue);
 545  555  
 546  556      _tasks[i] = new CMTask(i, this, task_queue, _task_queues);
 547  557      _accum_task_vtime[i] = 0.0;
      558 +
      559 +    _count_card_bitmaps[i] = BitMap(card_bm_size, false);
      560 +    _count_marked_bytes[i] = NEW_C_HEAP_ARRAY(size_t, max_regions);
 548  561    }
 549  562  
 550  563    if (ConcGCThreads > ParallelGCThreads) {
 551  564      vm_exit_during_initialization("Can't have more ConcGCThreads "
 552  565                                    "than ParallelGCThreads.");
 553  566    }
 554  567    if (ParallelGCThreads == 0) {
 555  568      // if we are not running with any parallel GC threads we will not
 556  569      // spawn any marking threads either
 557  570      _parallel_marking_threads =   0;
↓ open down ↓ 101 lines elided ↑ open up ↑
 659  672    _heap_end   = committed.end();
 660  673  
 661  674    // Separated the asserts so that we know which one fires.
 662  675    assert(_heap_start != NULL, "heap bounds should look ok");
 663  676    assert(_heap_end != NULL, "heap bounds should look ok");
 664  677    assert(_heap_start < _heap_end, "heap bounds should look ok");
 665  678  
 666  679    // reset all the marking data structures and any necessary flags
 667  680    clear_marking_state();
 668  681  
      682 +  clear_all_count_data();
      683 +
 669  684    if (verbose_low()) {
 670  685      gclog_or_tty->print_cr("[global] resetting");
 671  686    }
 672  687  
 673  688    // We do reset all of them, since different phases will use
 674  689    // different number of active threads. So, it's easiest to have all
 675  690    // of them ready.
 676  691    for (int i = 0; i < (int) _max_task_num; ++i) {
 677  692      _tasks[i]->reset(_nextMarkBitMap);
 678  693    }
↓ open down ↓ 35 lines elided ↑ open up ↑
 714  729    // not doing marking.
 715  730    clear_marking_state();
 716  731    _active_tasks = 0;
 717  732    clear_concurrent_marking_in_progress();
 718  733  }
 719  734  
 720  735  ConcurrentMark::~ConcurrentMark() {
 721  736    for (int i = 0; i < (int) _max_task_num; ++i) {
 722  737      delete _task_queues->queue(i);
 723  738      delete _tasks[i];
      739 +
      740 +    _count_card_bitmaps[i].resize(0, false);
      741 +    FREE_C_HEAP_ARRAY(size_t, _count_marked_bytes[i]);
 724  742    }
      743 +
 725  744    delete _task_queues;
 726      -  FREE_C_HEAP_ARRAY(CMTask*, _max_task_num);
      745 +  FREE_C_HEAP_ARRAY(CMTask*, _tasks);
      746 +  FREE_C_HEAP_ARRAY(double, _accum_task_vtime);
      747 +
      748 +  FREE_C_HEAP_ARRAY(BitMap*, _count_card_bitmaps);
      749 +  FREE_C_HEAP_ARRAY(size_t*, _count_marked_bytes);
 727  750  }
 728  751  
 729  752  // This closure is used to mark refs into the g1 generation
 730  753  // from external roots in the CMS bit map.
 731  754  // Called at the first checkpoint.
 732  755  //
 733  756  
 734  757  void ConcurrentMark::clearNextBitmap() {
 735  758    G1CollectedHeap* g1h = G1CollectedHeap::heap();
 736  759    G1CollectorPolicy* g1p = g1h->g1_policy();
↓ open down ↓ 198 lines elided ↑ open up ↑
 935  958  bool ForceOverflowSettings::should_force() {
 936  959    if (_force) {
 937  960      _force = false;
 938  961      return true;
 939  962    } else {
 940  963      return false;
 941  964    }
 942  965  }
 943  966  #endif // !PRODUCT
 944  967  
 945      -void ConcurrentMark::grayRoot(oop p) {
      968 +void ConcurrentMark::grayRoot(oop p, int worker_i) {
 946  969    HeapWord* addr = (HeapWord*) p;
 947  970    // We can't really check against _heap_start and _heap_end, since it
 948  971    // is possible during an evacuation pause with piggy-backed
 949  972    // initial-mark that the committed space is expanded during the
 950  973    // pause without CM observing this change. So the assertions below
 951  974    // is a bit conservative; but better than nothing.
 952  975    assert(_g1h->g1_committed().contains(addr),
 953  976           "address should be within the heap bounds");
 954  977  
 955  978    if (!_nextMarkBitMap->isMarked(addr)) {
 956      -    _nextMarkBitMap->parMark(addr);
      979 +    if (_nextMarkBitMap->parMark(addr)) {
      980 +      // Update the task specific count data for object p.
      981 +      add_to_count_data_for(p, worker_i);
      982 +    }
 957  983    }
 958  984  }
 959  985  
 960  986  void ConcurrentMark::grayRegionIfNecessary(MemRegion mr) {
 961  987    // The objects on the region have already been marked "in bulk" by
 962  988    // the caller. We only need to decide whether to push the region on
 963  989    // the region stack or not.
 964  990  
 965  991    if (!concurrent_marking_in_progress() || !_should_gray_objects) {
 966  992      // We're done with marking and waiting for remark. We do not need to
↓ open down ↓ 28 lines elided ↑ open up ↑
 995 1021      }
 996 1022  
 997 1023      if (!region_stack_push_lock_free(mr)) {
 998 1024        if (verbose_low()) {
 999 1025          gclog_or_tty->print_cr("[global] region stack has overflown.");
1000 1026        }
1001 1027      }
1002 1028    }
1003 1029  }
1004 1030  
1005      -void ConcurrentMark::markAndGrayObjectIfNecessary(oop p) {
     1031 +void ConcurrentMark::markAndGrayObjectIfNecessary(oop p, int worker_i) {
1006 1032    // The object is not marked by the caller. We need to at least mark
1007 1033    // it and maybe push in on the stack.
1008 1034  
1009 1035    HeapWord* addr = (HeapWord*)p;
1010 1036    if (!_nextMarkBitMap->isMarked(addr)) {
1011 1037      // We definitely need to mark it, irrespective whether we bail out
1012 1038      // because we're done with marking.
1013 1039      if (_nextMarkBitMap->parMark(addr)) {
     1040 +      // Update the task specific count data for object p
     1041 +      add_to_count_data_for(p, worker_i);
     1042 +      
1014 1043        if (!concurrent_marking_in_progress() || !_should_gray_objects) {
1015 1044          // If we're done with concurrent marking and we're waiting for
1016 1045          // remark, then we're not pushing anything on the stack.
1017 1046          return;
1018 1047        }
1019 1048  
1020 1049        // No OrderAccess:store_load() is needed. It is implicit in the
1021 1050        // CAS done in parMark(addr) above
1022 1051        HeapWord* finger = _finger;
1023 1052  
↓ open down ↓ 142 lines elided ↑ open up ↑
1166 1195  
1167 1196    if (has_overflown()) {
1168 1197      // Oops.  We overflowed.  Restart concurrent marking.
1169 1198      _restart_for_overflow = true;
1170 1199      // Clear the flag. We do not need it any more.
1171 1200      clear_has_overflown();
1172 1201      if (G1TraceMarkStackOverflow) {
1173 1202        gclog_or_tty->print_cr("\nRemark led to restart for overflow.");
1174 1203      }
1175 1204    } else {
     1205 +    // Aggregate the per-task counting data that we have accumulated
     1206 +    // while marking.
     1207 +    aggregate_all_count_data();
     1208 +
1176 1209      SATBMarkQueueSet& satb_mq_set = JavaThread::satb_mark_queue_set();
1177 1210      // We're done with marking.
1178 1211      // This is the end of  the marking cycle, we're expected all
1179 1212      // threads to have SATB queues with active set to true.
1180 1213      satb_mq_set.set_active_all_threads(false, /* new active value */
1181 1214                                         true /* expected_active */);
1182 1215  
1183 1216      if (VerifyDuringGC) {
1184 1217        HandleMark hm;  // handle scope
1185 1218        gclog_or_tty->print(" VerifyDuringGC:(after)");
↓ open down ↓ 19 lines elided ↑ open up ↑
1205 1238    double now = os::elapsedTime();
1206 1239    _remark_mark_times.add((mark_work_end - start) * 1000.0);
1207 1240    _remark_weak_ref_times.add((now - mark_work_end) * 1000.0);
1208 1241    _remark_times.add((now - start) * 1000.0);
1209 1242  
1210 1243    g1p->record_concurrent_mark_remark_end();
1211 1244  }
1212 1245  
1213 1246  #define CARD_BM_TEST_MODE 0
1214 1247  
     1248 +// Used to calculate the number of live (marked) bytes per region
     1249 +// for verification purposes
1215 1250  class CalcLiveObjectsClosure: public HeapRegionClosure {
1216 1251  
1217 1252    CMBitMapRO* _bm;
1218 1253    ConcurrentMark* _cm;
1219      -  bool _changed;
1220      -  bool _yield;
1221      -  size_t _words_done;
     1254 +  BitMap* _region_bm;
     1255 +  BitMap* _card_bm;
     1256 +
     1257 +  size_t _tot_words_done;
1222 1258    size_t _tot_live;
1223 1259    size_t _tot_used;
1224      -  size_t _regions_done;
1225      -  double _start_vtime_sec;
1226 1260  
1227      -  BitMap* _region_bm;
1228      -  BitMap* _card_bm;
     1261 +  size_t _region_marked_bytes;
     1262 +
1229 1263    intptr_t _bottom_card_num;
1230      -  bool _final;
1231 1264  
1232 1265    void mark_card_num_range(intptr_t start_card_num, intptr_t last_card_num) {
1233      -    for (intptr_t i = start_card_num; i <= last_card_num; i++) {
     1266 +    BitMap::idx_t start_idx = start_card_num - _bottom_card_num;
     1267 +    BitMap::idx_t last_idx = last_card_num - _bottom_card_num;
     1268 +    
     1269 +    for (BitMap::idx_t i = start_idx; i <= last_idx; i += 1) {
1234 1270  #if CARD_BM_TEST_MODE
1235      -      guarantee(_card_bm->at(i - _bottom_card_num), "Should already be set.");
     1271 +      guarantee(_card_bm->at(i), "Should already be set.");
1236 1272  #else
1237      -      _card_bm->par_at_put(i - _bottom_card_num, 1);
     1273 +      _card_bm->par_at_put(i, 1);
1238 1274  #endif
1239 1275      }
1240 1276    }
1241 1277  
1242 1278  public:
1243      -  CalcLiveObjectsClosure(bool final,
1244      -                         CMBitMapRO *bm, ConcurrentMark *cm,
     1279 +  CalcLiveObjectsClosure(CMBitMapRO *bm, ConcurrentMark *cm,
1245 1280                           BitMap* region_bm, BitMap* card_bm) :
1246      -    _bm(bm), _cm(cm), _changed(false), _yield(true),
1247      -    _words_done(0), _tot_live(0), _tot_used(0),
1248      -    _region_bm(region_bm), _card_bm(card_bm),_final(final),
1249      -    _regions_done(0), _start_vtime_sec(0.0)
     1281 +    _bm(bm), _cm(cm), _region_bm(region_bm), _card_bm(card_bm),
     1282 +    _region_marked_bytes(0), _tot_words_done(0),
     1283 +    _tot_live(0), _tot_used(0)
1250 1284    {
1251 1285      _bottom_card_num =
1252 1286        intptr_t(uintptr_t(G1CollectedHeap::heap()->reserved_region().start()) >>
1253 1287                 CardTableModRefBS::card_shift);
1254 1288    }
1255 1289  
1256 1290    // It takes a region that's not empty (i.e., it has at least one
1257 1291    // live object in it and sets its corresponding bit on the region
1258 1292    // bitmap to 1. If the region is "starts humongous" it will also set
1259 1293    // to 1 the bits on the region bitmap that correspond to its
↓ open down ↓ 20 lines elided ↑ open up ↑
1280 1314          HeapRegion* chr = g1h->region_at(end_index);
1281 1315          if (!chr->continuesHumongous()) break;
1282 1316          end_index += 1;
1283 1317        }
1284 1318        _region_bm->par_at_put_range((BitMap::idx_t) index,
1285 1319                                     (BitMap::idx_t) end_index, true);
1286 1320      }
1287 1321    }
1288 1322  
1289 1323    bool doHeapRegion(HeapRegion* hr) {
1290      -    if (!_final && _regions_done == 0) {
1291      -      _start_vtime_sec = os::elapsedVTime();
1292      -    }
1293 1324  
1294 1325      if (hr->continuesHumongous()) {
1295 1326        // We will ignore these here and process them when their
1296 1327        // associated "starts humongous" region is processed (see
1297 1328        // set_bit_for_heap_region()). Note that we cannot rely on their
1298 1329        // associated "starts humongous" region to have their bit set to
1299 1330        // 1 since, due to the region chunking in the parallel region
1300 1331        // iteration, a "continues humongous" region might be visited
1301 1332        // before its associated "starts humongous".
1302 1333        return false;
1303 1334      }
1304 1335  
1305 1336      HeapWord* nextTop = hr->next_top_at_mark_start();
1306      -    HeapWord* start   = hr->top_at_conc_mark_count();
1307      -    assert(hr->bottom() <= start && start <= hr->end() &&
1308      -           hr->bottom() <= nextTop && nextTop <= hr->end() &&
1309      -           start <= nextTop,
1310      -           "Preconditions.");
1311      -    // Otherwise, record the number of word's we'll examine.
     1337 +    HeapWord* start   = hr->bottom();
     1338 +
     1339 +    assert(start <= hr->end() && start <= nextTop && nextTop <= hr->end(),
     1340 +                "Preconditions.");
     1341 +
     1342 +    // Record the number of words we'll examine.
1312 1343      size_t words_done = (nextTop - start);
     1344 +
1313 1345      // Find the first marked object at or after "start".
1314 1346      start = _bm->getNextMarkedWordAddress(start, nextTop);
     1347 +
1315 1348      size_t marked_bytes = 0;
     1349 +    _region_marked_bytes = 0;
1316 1350  
1317 1351      // Below, the term "card num" means the result of shifting an address
1318 1352      // by the card shift -- address 0 corresponds to card number 0.  One
1319 1353      // must subtract the card num of the bottom of the heap to obtain a
1320 1354      // card table index.
     1355 +
1321 1356      // The first card num of the sequence of live cards currently being
1322 1357      // constructed.  -1 ==> no sequence.
1323 1358      intptr_t start_card_num = -1;
     1359 +
1324 1360      // The last card num of the sequence of live cards currently being
1325 1361      // constructed.  -1 ==> no sequence.
1326 1362      intptr_t last_card_num = -1;
1327 1363  
1328 1364      while (start < nextTop) {
1329      -      if (_yield && _cm->do_yield_check()) {
1330      -        // We yielded.  It might be for a full collection, in which case
1331      -        // all bets are off; terminate the traversal.
1332      -        if (_cm->has_aborted()) {
1333      -          _changed = false;
1334      -          return true;
1335      -        } else {
1336      -          // Otherwise, it might be a collection pause, and the region
1337      -          // we're looking at might be in the collection set.  We'll
1338      -          // abandon this region.
1339      -          return false;
1340      -        }
1341      -      }
1342 1365        oop obj = oop(start);
1343 1366        int obj_sz = obj->size();
     1367 +
1344 1368        // The card num of the start of the current object.
1345 1369        intptr_t obj_card_num =
1346 1370          intptr_t(uintptr_t(start) >> CardTableModRefBS::card_shift);
1347      -
1348 1371        HeapWord* obj_last = start + obj_sz - 1;
1349 1372        intptr_t obj_last_card_num =
1350 1373          intptr_t(uintptr_t(obj_last) >> CardTableModRefBS::card_shift);
1351 1374  
1352 1375        if (obj_card_num != last_card_num) {
1353 1376          if (start_card_num == -1) {
1354 1377            assert(last_card_num == -1, "Both or neither.");
1355 1378            start_card_num = obj_card_num;
1356 1379          } else {
1357 1380            assert(last_card_num != -1, "Both or neither.");
↓ open down ↓ 12 lines elided ↑ open up ↑
1370 1393          */
1371 1394          for (intptr_t j = obj_card_num; j <= obj_last_card_num; j++) {
1372 1395            _card_bm->par_at_put(j - _bottom_card_num, 1);
1373 1396          }
1374 1397  #endif
1375 1398        }
1376 1399        // In any case, we set the last card num.
1377 1400        last_card_num = obj_last_card_num;
1378 1401  
1379 1402        marked_bytes += (size_t)obj_sz * HeapWordSize;
     1403 +
1380 1404        // Find the next marked object after this one.
1381 1405        start = _bm->getNextMarkedWordAddress(start + 1, nextTop);
1382      -      _changed = true;
1383 1406      }
     1407 +
1384 1408      // Handle the last range, if any.
1385 1409      if (start_card_num != -1) {
1386 1410        mark_card_num_range(start_card_num, last_card_num);
1387 1411      }
1388      -    if (_final) {
1389      -      // Mark the allocated-since-marking portion...
1390      -      HeapWord* tp = hr->top();
1391      -      if (nextTop < tp) {
1392      -        start_card_num =
1393      -          intptr_t(uintptr_t(nextTop) >> CardTableModRefBS::card_shift);
1394      -        last_card_num =
1395      -          intptr_t(uintptr_t(tp) >> CardTableModRefBS::card_shift);
1396      -        mark_card_num_range(start_card_num, last_card_num);
1397      -        // This definitely means the region has live objects.
1398      -        set_bit_for_region(hr);
1399      -      }
     1412 +
     1413 +    // Mark the allocated-since-marking portion...
     1414 +    HeapWord* top = hr->top();
     1415 +    if (nextTop < top) {
     1416 +      start_card_num = intptr_t(uintptr_t(nextTop) >> CardTableModRefBS::card_shift);
     1417 +      last_card_num = intptr_t(uintptr_t(top) >> CardTableModRefBS::card_shift);
     1418 +
     1419 +      mark_card_num_range(start_card_num, last_card_num);
     1420 +      
     1421 +      // This definitely means the region has live objects.
     1422 +      set_bit_for_region(hr);
1400 1423      }
1401 1424  
1402      -    hr->add_to_marked_bytes(marked_bytes);
1403 1425      // Update the live region bitmap.
1404 1426      if (marked_bytes > 0) {
1405 1427        set_bit_for_region(hr);
1406 1428      }
1407      -    hr->set_top_at_conc_mark_count(nextTop);
     1429 +
     1430 +    // Set the marked bytes for the current region so that
     1431 +    // it can be queried by a calling verification routine
     1432 +    _region_marked_bytes = marked_bytes;
     1433 +
1408 1434      _tot_live += hr->next_live_bytes();
1409 1435      _tot_used += hr->used();
1410      -    _words_done = words_done;
     1436 +    _tot_words_done = words_done;
1411 1437  
1412      -    if (!_final) {
1413      -      ++_regions_done;
1414      -      if (_regions_done % 10 == 0) {
1415      -        double end_vtime_sec = os::elapsedVTime();
1416      -        double elapsed_vtime_sec = end_vtime_sec - _start_vtime_sec;
1417      -        if (elapsed_vtime_sec > (10.0 / 1000.0)) {
1418      -          jlong sleep_time_ms =
1419      -            (jlong) (elapsed_vtime_sec * _cm->cleanup_sleep_factor() * 1000.0);
1420      -          os::sleep(Thread::current(), sleep_time_ms, false);
1421      -          _start_vtime_sec = end_vtime_sec;
1422      -        }
     1438 +    return false;
     1439 +  }
     1440 +
     1441 +  size_t region_marked_bytes() const { return _region_marked_bytes; }
     1442 +  size_t tot_words_done() const      { return _tot_words_done; }
     1443 +  size_t tot_live() const            { return _tot_live; }
     1444 +  size_t tot_used() const            { return _tot_used; }
     1445 +};
     1446 +
     1447 +// Aggregate the counting data that was constructed concurrently
     1448 +// with marking.
     1449 +class AddToMarkedBytesClosure: public HeapRegionClosure {
     1450 +  ConcurrentMark* _cm;
     1451 +  size_t _task_num;
     1452 +  size_t _max_task_num;
     1453 +
     1454 +  bool _final;
     1455 +
     1456 +public:
     1457 +  AddToMarkedBytesClosure(ConcurrentMark *cm,
     1458 +                          size_t task_num,
     1459 +                          size_t max_task_num) :
     1460 +    _cm(cm),
     1461 +    _task_num(task_num),
     1462 +    _max_task_num(max_task_num),
     1463 +    _final(false)
     1464 +  {
     1465 +    assert(0 <= _task_num && _task_num < _max_task_num, "sanity");
     1466 +    if ((_max_task_num - _task_num) == 1) {
     1467 +      // Last task
     1468 +      _final = true;
     1469 +    }
     1470 +  }
     1471 +
     1472 +  bool doHeapRegion(HeapRegion* hr) {
     1473 +    // Adds the value in the counted marked bytes array for
     1474 +    // _task_num for region hr to the value cached in heap
     1475 +    // region itself.
     1476 +    // For the final task we also set the top at conc count
     1477 +    // for the region.
     1478 +    // The bits in the live region bitmap are set for regions
     1479 +    // that contain live data during the cleanup pause.
     1480 +
     1481 +    if (hr->continuesHumongous()) {
     1482 +      // We will ignore these here and process them when their
     1483 +      // associated "starts humongous" region is processed.
     1484 +      // Note that we cannot rely on their associated
     1485 +      // "starts humongous" region to have their bit set to 1
     1486 +      // since, due to the region chunking in the parallel region
     1487 +      // iteration, a "continues humongous" region might be visited
     1488 +      // before its associated "starts humongous".
     1489 +      return false;
     1490 +    }
     1491 +
     1492 +    int hrs_index = hr->hrs_index();
     1493 +    size_t* marked_bytes_array = _cm->count_marked_bytes_for(_task_num);
     1494 +    size_t marked_bytes = marked_bytes_array[hrs_index];
     1495 +    hr->add_to_marked_bytes(marked_bytes);
     1496 +
     1497 +    if (_final) {
     1498 +      HeapWord* ntams = hr->next_top_at_mark_start();
     1499 +      HeapWord* start = hr->bottom();
     1500 +      
     1501 +      assert(start <= ntams && ntams <= hr->top() && hr->top() <= hr->end(),
     1502 +             "Preconditions.");
     1503 +
     1504 +      hr->set_top_at_conc_mark_count(ntams);
     1505 +    }
     1506 +
     1507 +    return false;
     1508 +  }
     1509 +};
     1510 +
     1511 +void ConcurrentMark::aggregate_all_count_data() {
     1512 +  _card_bm.clear();
     1513 +
     1514 +  // Unions the per task card bitmaps into the global card bitmap,
     1515 +  // and aggregates the per task marked bytes for each region into
     1516 +  // the heap region itself.
     1517 +
     1518 +  for (int i = 0; i < _max_task_num; i += 1) {
     1519 +    BitMap& task_card_bm = count_card_bitmap_for(i);
     1520 +    _card_bm.set_union(task_card_bm);
     1521 +
     1522 +    // Update the marked bytes for each region
     1523 +    AddToMarkedBytesClosure cl(this, i, _max_task_num);
     1524 +    _g1h->heap_region_iterate(&cl);
     1525 +  }
     1526 +
     1527 +  // We're done with the accumulated per-task concurrent
     1528 +  // counting data so let's clear it for the next marking.
     1529 +  clear_all_count_data();
     1530 +}
     1531 +
     1532 +// Final update of count data (during cleanup).
     1533 +// Adds [top_at_count, NTAMS) to the marked bytes for each
     1534 +// region. Sets the bits in the card bitmap corresponding
     1535 +// to the interval [top_at_count, top], and sets the
     1536 +// liveness bit for each region containing live data
     1537 +// in the region bitmap.
     1538 +
     1539 +class FinalCountDataUpdateClosure: public HeapRegionClosure {
     1540 +  ConcurrentMark* _cm;
     1541 +  BitMap* _region_bm;
     1542 +  BitMap* _card_bm;
     1543 +  intptr_t _bottom_card_num;
     1544 +
     1545 +  size_t _total_live_bytes;
     1546 +  size_t _total_used_bytes;
     1547 +  size_t _total_words_done;
     1548 +
     1549 +  void mark_card_num_range(intptr_t start_card_num, intptr_t last_card_num) {
     1550 +    BitMap::idx_t start_idx = start_card_num - _bottom_card_num;
     1551 +    BitMap::idx_t last_idx = last_card_num - _bottom_card_num;
     1552 +    
     1553 +    // Inclusive bit range [start_idx, last_idx]. par_at_put_range
     1554 +    // excludes its end index, so we also have to set the bit for
     1555 +    // last_idx. Passing last_idx+1 to par_at_put_range would work in
     1556 +    // most cases but could trip an OOB assertion.
     1557 +
     1558 +    if ((last_idx - start_idx) > 0) {
     1559 +      _card_bm->par_at_put_range(start_idx, last_idx, true);
     1560 +    }
     1561 +    _card_bm->par_set_bit(last_idx);
     1562 +  }
     1563 +
     1564 +  // It takes a region that's not empty (i.e., it has at least one
     1565 +  // live object in it) and sets its corresponding bit on the region
     1566 +  // bitmap to 1. If the region is "starts humongous" it will also set
     1567 +  // to 1 the bits on the region bitmap that correspond to its
     1568 +  // associated "continues humongous" regions.
     1569 +  void set_bit_for_region(HeapRegion* hr) {
     1570 +    assert(!hr->continuesHumongous(), "should have filtered those out");
     1571 +
     1572 +    size_t index = hr->hrs_index();
     1573 +    if (!hr->startsHumongous()) {
     1574 +      // Normal (non-humongous) case: just set the bit.
     1575 +      _region_bm->par_set_bit((BitMap::idx_t) index);
     1576 +    } else {
     1577 +      // Starts humongous case: calculate how many regions are part of
     1578 +      // this humongous region and then set the bit range. It might
     1579 +      // have been a bit more efficient to look at the object that
     1580 +      // spans these humongous regions to calculate their number from
     1581 +      // the object's size. However, it's a good idea to calculate
     1582 +      // this based on the metadata itself, and not the region
     1583 +      // contents, so that this code is not aware of what goes into
     1584 +      // the humongous regions (in case this changes in the future).
     1585 +      G1CollectedHeap* g1h = G1CollectedHeap::heap();
     1586 +      size_t end_index = index + 1;
     1587 +      while (end_index < g1h->n_regions()) {
     1588 +        HeapRegion* chr = g1h->region_at(end_index);
     1589 +        if (!chr->continuesHumongous()) break;
     1590 +        end_index += 1;
1423 1591        }
     1592 +      _region_bm->par_at_put_range((BitMap::idx_t) index,
     1593 +                                   (BitMap::idx_t) end_index, true);
1424 1594      }
     1595 +  }
     1596 +
     1597 + public:
     1598 +  FinalCountDataUpdateClosure(ConcurrentMark* cm,
     1599 +                              BitMap* region_bm,
     1600 +                              BitMap* card_bm) :
     1601 +    _cm(cm), _region_bm(region_bm), _card_bm(card_bm),
     1602 +    _total_words_done(0), _total_live_bytes(0), _total_used_bytes(0)
     1603 +  {
     1604 +    _bottom_card_num =
     1605 +      intptr_t(uintptr_t(G1CollectedHeap::heap()->reserved_region().start()) >>
     1606 +               CardTableModRefBS::card_shift);
     1607 +  }
     1608 +
     1609 +  bool doHeapRegion(HeapRegion* hr) {
     1610 +
     1611 +    if (hr->continuesHumongous()) {
     1612 +      // We will ignore these here and process them when their
     1613 +      // associated "starts humongous" region is processed (see
     1614 +      // set_bit_for_region()). Note that we cannot rely on their
     1615 +      // associated "starts humongous" region to have their bit set to
     1616 +      // 1 since, due to the region chunking in the parallel region
     1617 +      // iteration, a "continues humongous" region might be visited
     1618 +      // before its associated "starts humongous".
     1619 +      return false;
     1620 +    }
     1621 +
     1622 +    HeapWord* start = hr->top_at_conc_mark_count();
     1623 +    HeapWord* ntams = hr->next_top_at_mark_start();
     1624 +    HeapWord* top   = hr->top();
     1625 +    
     1626 +    assert(hr->bottom() <= start && start <= hr->end() &&
     1627 +           hr->bottom() <= ntams && ntams <= hr->end(), "Preconditions.");
     1628 +    
     1629 +    size_t words_done = ntams - hr->bottom();
     1630 +
     1631 +    intptr_t start_card_num = intptr_t(uintptr_t(start) >> CardTableModRefBS::card_shift);
     1632 +    intptr_t last_card_num = intptr_t(uintptr_t(top) >> CardTableModRefBS::card_shift);
     1633 +
     1634 +
     1635 +    if (start < ntams) {
     1636 +      // Region was changed between remark and cleanup pauses.
     1637 +      // We need to add (ntams - start) to the marked bytes
     1638 +      // for this region, and set bits for the range
     1639 +      // [ card_num(start), card_num(ntams) ) in the
     1640 +      // card bitmap.
     1641 +      size_t live_bytes = (ntams - start) * HeapWordSize;
     1642 +      hr->add_to_marked_bytes(live_bytes);
     1643 +      
     1644 +      // Record the new top at conc count
     1645 +      hr->set_top_at_conc_mark_count(ntams);
     1646 +
     1647 +      // The setting of the bits in the card bitmap takes place below
     1648 +    }
     1649 +
     1650 +    // Mark the allocated-since-marking portion...
     1651 +    if (ntams < top) {
     1652 +      // This definitely means the region has live objects.
     1653 +      set_bit_for_region(hr);
     1654 +    }
     1655 +
     1656 +    // Now set the bits for [start, top]
     1657 +    mark_card_num_range(start_card_num, last_card_num);
     1658 +
     1659 +    // Set the bit for the region if it contains live data
     1660 +    if (hr->next_marked_bytes() > 0) {
     1661 +      set_bit_for_region(hr);
     1662 +    }
     1663 +
     1664 +    _total_words_done += words_done;
     1665 +    _total_used_bytes += hr->used();
     1666 +    _total_live_bytes += hr->next_marked_bytes();
1425 1667  
1426 1668      return false;
1427 1669    }
1428 1670  
1429      -  bool changed() { return _changed;  }
1430      -  void reset()   { _changed = false; _words_done = 0; }
1431      -  void no_yield() { _yield = false; }
1432      -  size_t words_done() { return _words_done; }
1433      -  size_t tot_live() { return _tot_live; }
1434      -  size_t tot_used() { return _tot_used; }
     1671 +  size_t total_words_done() const { return _total_words_done; }
     1672 +  size_t total_live_bytes() const { return _total_live_bytes; }
     1673 +  size_t total_used_bytes() const { return _total_used_bytes; }
1435 1674  };
1436 1675  
     1676 +// Heap region closure used for verifying the counting data
     1677 +// that was accumulated concurrently and aggregated during
     1678 +// the remark pause. This closure is applied to the heap
     1679 +// regions during the STW cleanup pause.
1437 1680  
1438      -void ConcurrentMark::calcDesiredRegions() {
1439      -  _region_bm.clear();
1440      -  _card_bm.clear();
1441      -  CalcLiveObjectsClosure calccl(false /*final*/,
1442      -                                nextMarkBitMap(), this,
1443      -                                &_region_bm, &_card_bm);
1444      -  G1CollectedHeap *g1h = G1CollectedHeap::heap();
1445      -  g1h->heap_region_iterate(&calccl);
     1681 +class VerifyLiveObjectDataHRClosure: public HeapRegionClosure {
     1682 +  ConcurrentMark* _cm;
     1683 +  CalcLiveObjectsClosure _calc_cl;
     1684 +  BitMap* _region_bm;   // Region BM to be verified
     1685 +  BitMap* _card_bm;     // Card BM to be verified
     1686 +  bool _verbose;        // verbose output?
1446 1687  
1447      -  do {
1448      -    calccl.reset();
1449      -    g1h->heap_region_iterate(&calccl);
1450      -  } while (calccl.changed());
1451      -}
     1688 +  BitMap* _exp_region_bm; // Expected Region BM values
     1689 +  BitMap* _exp_card_bm;   // Expected card BM values
     1690 +
     1691 +  intptr_t _bottom_card_num; // Used for calculating bitmap indices
     1692 +
     1693 +  int _failures;
     1694 +
     1695 +public:
     1696 +  VerifyLiveObjectDataHRClosure(ConcurrentMark* cm,
     1697 +                                BitMap* region_bm,
     1698 +                                BitMap* card_bm,
     1699 +                                BitMap* exp_region_bm,
     1700 +                                BitMap* exp_card_bm,
     1701 +                                bool verbose) :
     1702 +    _cm(cm),
     1703 +    _calc_cl(_cm->nextMarkBitMap(), _cm, exp_region_bm, exp_card_bm),
     1704 +    _region_bm(region_bm), _card_bm(card_bm), _verbose(verbose),
     1705 +    _exp_region_bm(exp_region_bm), _exp_card_bm(exp_card_bm),
     1706 +    _failures(0)
     1707 +  { 
     1708 +    _bottom_card_num =
     1709 +      intptr_t(uintptr_t(G1CollectedHeap::heap()->reserved_region().start()) >>
     1710 +               CardTableModRefBS::card_shift);
     1711 +  }
     1712 +
     1713 +  int failures() const { return _failures; }
     1714 +
     1715 +  bool doHeapRegion(HeapRegion* hr) {
     1716 +    if (hr->continuesHumongous()) {
     1717 +      // We will ignore these here and process them when their
     1718 +      // associated "starts humongous" region is processed (see
     1719 +      // set_bit_for_region()). Note that we cannot rely on their
     1720 +      // associated "starts humongous" region to have their bit set to
     1721 +      // 1 since, due to the region chunking in the parallel region
     1722 +      // iteration, a "continues humongous" region might be visited
     1723 +      // before its associated "starts humongous".
     1724 +      return false;
     1725 +    }
     1726 +
     1727 +    // Call the CalcLiveObjectsClosure to walk the marking bitmap for
     1728 +    // this region and set the corresponding bits in the expected region
     1729 +    // and card bitmaps.
     1730 +    bool res = _calc_cl.doHeapRegion(hr);
     1731 +    assert(res == false, "should be continuing");
     1732 +
     1733 +    // Note that the calculated count data could be a subset of the
     1734 +    // count data that was accumulated during marking. See the comment
     1735 +    // in G1ParCopyHelper::copy_to_survivor_space for an explanation
     1736 +    // why.
     1737 +
     1738 +    if (_verbose) {
     1739 +      gclog_or_tty->print("Region %d: bottom: "PTR_FORMAT", ntams: "
     1740 +                          PTR_FORMAT", top: "PTR_FORMAT", end: "PTR_FORMAT,
     1741 +                          hr->hrs_index(), hr->bottom(), hr->next_top_at_mark_start(),
     1742 +                          hr->top(), hr->end());
     1743 +      gclog_or_tty->print_cr(", marked_bytes: calc/actual "SIZE_FORMAT"/"SIZE_FORMAT,
     1744 +                             _calc_cl.region_marked_bytes(),
     1745 +                             hr->next_marked_bytes());
     1746 +    }
     1747 +
     1748 +    // Verify that _top_at_conc_count == ntams
     1749 +    if (hr->top_at_conc_mark_count() != hr->next_top_at_mark_start()) {
     1750 +      if (_verbose) {
     1751 +        gclog_or_tty->print_cr("Region %d: top at conc count incorrect: expected "
     1752 +                               PTR_FORMAT", actual: "PTR_FORMAT,
     1753 +                               hr->hrs_index(), hr->next_top_at_mark_start(),
     1754 +                               hr->top_at_conc_mark_count());
     1755 +      }
     1756 +      _failures += 1;
     1757 +    }
     1758 +
     1759 +    // Verify the marked bytes for this region. 
     1760 +    size_t exp_marked_bytes = _calc_cl.region_marked_bytes();
     1761 +    size_t act_marked_bytes = hr->next_marked_bytes();
     1762 +
     1763 +    // We're OK if actual marked bytes >= expected.
     1764 +    if (exp_marked_bytes > act_marked_bytes) {
     1765 +      if (_verbose) {
     1766 +        gclog_or_tty->print_cr("Region %d: marked bytes mismatch: expected: "
     1767 +                               SIZE_FORMAT", actual: "SIZE_FORMAT,
     1768 +                               hr->hrs_index(), exp_marked_bytes, act_marked_bytes);
     1769 +      }
     1770 +      _failures += 1;
     1771 +    }
     1772 +
     1773 +    // Verify the bit, for this region, in the actual and expected
     1774 +    // (which was just calculated) region bit maps.
     1775 +    // We're not OK if the expected bit is set and the actual is not set.
     1776 +    BitMap::idx_t index = (BitMap::idx_t)hr->hrs_index();
     1777 +    
     1778 +    bool expected = _exp_region_bm->at(index);
     1779 +    bool actual = _region_bm->at(index);
     1780 +    if (expected && !actual) {
     1781 +      if (_verbose) {
     1782 +        gclog_or_tty->print_cr("Region %d: region bitmap mismatch: expected: %d, actual: %d",
     1783 +                               hr->hrs_index(), expected, actual);
     1784 +      }
     1785 +      _failures += 1;
     1786 +    }
     1787 +
     1788 +    // Verify that the card bit maps for the cards spanned by the current
     1789 +    // region match. The set of offsets that have set bits in the expected
     1790 +    // bitmap should be a subset of the offsets with set bits from the actual
     1791 +    // calculated card bitmap.
     1792 +    // Again it's more important that if the expected bit is set then the
     1793 +    // actual bit be set.
     1794 +    intptr_t start_card_num =
     1795 +        intptr_t(uintptr_t(hr->bottom()) >> CardTableModRefBS::card_shift);
     1796 +    intptr_t top_card_num =
     1797 +        intptr_t(uintptr_t(hr->top()) >> CardTableModRefBS::card_shift);
     1798 +
     1799 +    BitMap::idx_t start_idx = start_card_num - _bottom_card_num;
     1800 +    BitMap::idx_t end_idx = top_card_num - _bottom_card_num;
     1801 +
     1802 +    for (BitMap::idx_t i = start_idx; i < end_idx; i+=1) {
     1803 +      expected = _exp_card_bm->at(i);
     1804 +      actual = _card_bm->at(i);
     1805 +      
     1806 +      if (expected && !actual) {
     1807 +        if (_verbose) {
     1808 +          gclog_or_tty->print_cr("Region %d: card bitmap mismatch at idx %d: expected: %d, actual: %d",
     1809 +                                 hr->hrs_index(), i, expected, actual);
     1810 +        }
     1811 +        _failures += 1;
     1812 +      }
     1813 +    }
     1814 +    if (_failures) {
     1815 +      // Stop iteration?
     1816 +      return true;
     1817 +    }
     1818 +
     1819 +    return false;
     1820 +  }
     1821 +};
     1822 +
     1823 +class Mux2HRClosure: public HeapRegionClosure {
     1824 +  HeapRegionClosure* _cl1;
     1825 +  HeapRegionClosure* _cl2;
     1826 +
     1827 +public:
     1828 +  Mux2HRClosure(HeapRegionClosure *c1, HeapRegionClosure *c2) : _cl1(c1), _cl2(c2) { }
     1829 +  bool doHeapRegion(HeapRegion* hr) {
     1830 +    bool res1 = _cl1->doHeapRegion(hr);
     1831 +    bool res2 = _cl2->doHeapRegion(hr);
     1832 +
     1833 +    // Only continue if both return false;
     1834 +    return res1 || res2;
     1835 +  }
     1836 +};
1452 1837  
1453 1838  class G1ParFinalCountTask: public AbstractGangTask {
1454 1839  protected:
1455 1840    G1CollectedHeap* _g1h;
1456 1841    CMBitMap* _bm;
1457 1842    size_t _n_workers;
1458 1843    size_t *_live_bytes;
1459 1844    size_t *_used_bytes;
1460      -  BitMap* _region_bm;
1461      -  BitMap* _card_bm;
     1845 +
     1846 +  BitMap* _actual_region_bm;
     1847 +  BitMap* _actual_card_bm;
     1848 +
     1849 +  BitMap _expected_region_bm;
     1850 +  BitMap _expected_card_bm;
     1851 +
     1852 +  int _failures;
     1853 +
1462 1854  public:
1463 1855    G1ParFinalCountTask(G1CollectedHeap* g1h, CMBitMap* bm,
1464 1856                        BitMap* region_bm, BitMap* card_bm)
1465      -    : AbstractGangTask("G1 final counting"), _g1h(g1h),
1466      -      _bm(bm), _region_bm(region_bm), _card_bm(card_bm) {
     1857 +    : AbstractGangTask("G1 final counting"),
     1858 +      _g1h(g1h), _bm(bm),
     1859 +      _actual_region_bm(region_bm), _actual_card_bm(card_bm),
     1860 +      _expected_region_bm(0, false), _expected_card_bm(0, false),
     1861 +      _failures(0)
     1862 +  {
1467 1863      if (ParallelGCThreads > 0) {
1468 1864        _n_workers = _g1h->workers()->total_workers();
1469 1865      } else {
1470 1866        _n_workers = 1;
1471 1867      }
     1868 +
1472 1869      _live_bytes = NEW_C_HEAP_ARRAY(size_t, _n_workers);
1473 1870      _used_bytes = NEW_C_HEAP_ARRAY(size_t, _n_workers);
     1871 +
     1872 +    if (VerifyDuringGC) {
     1873 +      _expected_card_bm.resize(_actual_card_bm->size(), false);
     1874 +      _expected_region_bm.resize(_actual_region_bm->size(), false);
     1875 +    }
1474 1876    }
1475 1877  
1476 1878    ~G1ParFinalCountTask() {
     1879 +    if (VerifyDuringGC) {
     1880 +      _expected_region_bm.resize(0);
     1881 +      _expected_card_bm.resize(0);
     1882 +    }
1477 1883      FREE_C_HEAP_ARRAY(size_t, _live_bytes);
1478 1884      FREE_C_HEAP_ARRAY(size_t, _used_bytes);
1479 1885    }
1480 1886  
1481 1887    void work(int i) {
1482      -    CalcLiveObjectsClosure calccl(true /*final*/,
1483      -                                  _bm, _g1h->concurrent_mark(),
1484      -                                  _region_bm, _card_bm);
1485      -    calccl.no_yield();
     1888 +
     1889 +    FinalCountDataUpdateClosure final_update_cl(_g1h->concurrent_mark(),
     1890 +                                                _actual_region_bm, _actual_card_bm);
     1891 +
     1892 +    VerifyLiveObjectDataHRClosure verify_cl(_g1h->concurrent_mark(),
     1893 +                                            _actual_region_bm, _actual_card_bm,
     1894 +                                            &_expected_region_bm,
     1895 +                                            &_expected_card_bm,
     1896 +                                            true /* verbose */);
     1897 +
     1898 +    Mux2HRClosure update_and_verify_cl(&final_update_cl, &verify_cl);
     1899 +
     1900 +    HeapRegionClosure* hr_cl = &final_update_cl;
     1901 +    if (VerifyDuringGC) {
     1902 +      hr_cl = &update_and_verify_cl;
     1903 +    }
     1904 +
1486 1905      if (G1CollectedHeap::use_parallel_gc_threads()) {
1487      -      _g1h->heap_region_par_iterate_chunked(&calccl, i,
     1906 +      _g1h->heap_region_par_iterate_chunked(hr_cl, i,
1488 1907                                              HeapRegion::FinalCountClaimValue);
1489 1908      } else {
1490      -      _g1h->heap_region_iterate(&calccl);
     1909 +      _g1h->heap_region_iterate(hr_cl);
1491 1910      }
1492      -    assert(calccl.complete(), "Shouldn't have yielded!");
1493 1911  
1494 1912      assert((size_t) i < _n_workers, "invariant");
1495      -    _live_bytes[i] = calccl.tot_live();
1496      -    _used_bytes[i] = calccl.tot_used();
     1913 +    _live_bytes[i] = final_update_cl.total_live_bytes();
     1914 +    _used_bytes[i] = final_update_cl.total_used_bytes();
     1915 +
     1916 +    if (VerifyDuringGC) {
     1917 +      _failures += verify_cl.failures();
     1918 +    }
1497 1919    }
     1920 +
1498 1921    size_t live_bytes()  {
1499 1922      size_t live_bytes = 0;
1500 1923      for (size_t i = 0; i < _n_workers; ++i)
1501 1924        live_bytes += _live_bytes[i];
1502 1925      return live_bytes;
1503 1926    }
     1927 +
1504 1928    size_t used_bytes()  {
1505 1929      size_t used_bytes = 0;
1506 1930      for (size_t i = 0; i < _n_workers; ++i)
1507 1931        used_bytes += _used_bytes[i];
1508 1932      return used_bytes;
1509 1933    }
     1934 +
     1935 +  int failures() const { return _failures; }
1510 1936  };
1511 1937  
1512 1938  class G1ParNoteEndTask;
1513 1939  
1514 1940  class G1NoteEndOfConcMarkClosure : public HeapRegionClosure {
1515 1941    G1CollectedHeap* _g1;
1516 1942    int _worker_num;
1517 1943    size_t _max_live_bytes;
1518 1944    size_t _regions_claimed;
1519 1945    size_t _freed_bytes;
↓ open down ↓ 180 lines elided ↑ open up ↑
1700 2126                       /* option      */ VerifyOption_G1UsePrevMarking);
1701 2127    }
1702 2128  
1703 2129    G1CollectorPolicy* g1p = G1CollectedHeap::heap()->g1_policy();
1704 2130    g1p->record_concurrent_mark_cleanup_start();
1705 2131  
1706 2132    double start = os::elapsedTime();
1707 2133  
1708 2134    HeapRegionRemSet::reset_for_cleanup_tasks();
1709 2135  
     2136 +  // Clear the global region bitmap - it will be filled as part
     2137 +  // of the final counting task.
     2138 +  _region_bm.clear();
     2139 +
1710 2140    // Do counting once more with the world stopped for good measure.
1711 2141    G1ParFinalCountTask g1_par_count_task(g1h, nextMarkBitMap(),
1712 2142                                          &_region_bm, &_card_bm);
     2143 +
1713 2144    if (G1CollectedHeap::use_parallel_gc_threads()) {
1714      -    assert(g1h->check_heap_region_claim_values(
1715      -                                               HeapRegion::InitialClaimValue),
     2145 +    assert(g1h->check_heap_region_claim_values(HeapRegion::InitialClaimValue),
1716 2146             "sanity check");
1717 2147  
1718 2148      int n_workers = g1h->workers()->total_workers();
1719 2149      g1h->set_par_threads(n_workers);
1720 2150      g1h->workers()->run_task(&g1_par_count_task);
1721 2151      g1h->set_par_threads(0);
1722 2152  
1723      -    assert(g1h->check_heap_region_claim_values(
1724      -                                             HeapRegion::FinalCountClaimValue),
     2153 +    assert(g1h->check_heap_region_claim_values(HeapRegion::FinalCountClaimValue),
1725 2154             "sanity check");
1726 2155    } else {
1727 2156      g1_par_count_task.work(0);
1728 2157    }
1729 2158  
     2159 +  // Verify that there were no verification failures of
     2160 +  // the live counting data.
     2161 +  if (VerifyDuringGC) {
     2162 +    assert(g1_par_count_task.failures() == 0, "Unexpected failures");
     2163 +  }
     2164 +
1730 2165    size_t known_garbage_bytes =
1731 2166      g1_par_count_task.used_bytes() - g1_par_count_task.live_bytes();
1732 2167    g1p->set_known_garbage_bytes(known_garbage_bytes);
1733 2168  
1734 2169    size_t start_used_bytes = g1h->used();
1735 2170    _at_least_one_mark_complete = true;
1736 2171    g1h->set_marking_complete();
1737 2172  
1738 2173    ergo_verbose4(ErgoConcCycles,
1739 2174             "finish cleanup",
↓ open down ↓ 182 lines elided ↑ open up ↑
1922 2357  }
1923 2358  
1924 2359  class G1CMKeepAliveClosure: public OopClosure {
1925 2360    G1CollectedHeap* _g1;
1926 2361    ConcurrentMark*  _cm;
1927 2362    CMBitMap*        _bitMap;
1928 2363   public:
1929 2364    G1CMKeepAliveClosure(G1CollectedHeap* g1, ConcurrentMark* cm,
1930 2365                         CMBitMap* bitMap) :
1931 2366      _g1(g1), _cm(cm),
1932      -    _bitMap(bitMap) {}
     2367 +    _bitMap(bitMap)
     2368 +  {
     2369 +    assert(Thread::current()->is_VM_thread(), "otherwise fix worker id");
     2370 +  }
1933 2371  
1934 2372    virtual void do_oop(narrowOop* p) { do_oop_work(p); }
1935 2373    virtual void do_oop(      oop* p) { do_oop_work(p); }
1936 2374  
1937 2375    template <class T> void do_oop_work(T* p) {
1938 2376      oop obj = oopDesc::load_decode_heap_oop(p);
1939 2377      HeapWord* addr = (HeapWord*)obj;
1940 2378  
1941 2379      if (_cm->verbose_high()) {
1942 2380        gclog_or_tty->print_cr("\t[0] we're looking at location "
1943 2381                               "*"PTR_FORMAT" = "PTR_FORMAT,
1944 2382                               p, (void*) obj);
1945 2383      }
1946 2384  
1947 2385      if (_g1->is_in_g1_reserved(addr) && _g1->is_obj_ill(obj)) {
1948 2386        _bitMap->mark(addr);
     2387 +      // Update the task specific count data for obj
     2388 +      _cm->add_to_count_data_for(obj, 0 /* worker_i */);
     2389 +
1949 2390        _cm->mark_stack_push(obj);
1950 2391      }
1951 2392    }
1952 2393  };
1953 2394  
1954 2395  class G1CMDrainMarkingStackClosure: public VoidClosure {
1955 2396    CMMarkStack*                  _markStack;
1956 2397    CMBitMap*                     _bitMap;
1957 2398    G1CMKeepAliveClosure*         _oopClosure;
1958 2399   public:
↓ open down ↓ 630 lines elided ↑ open up ↑
2589 3030        // anything with it).
2590 3031        HeapRegion* hr = _g1h->heap_region_containing_raw(obj);
2591 3032        if (!hr->obj_allocated_since_next_marking(obj)) {
2592 3033          if (verbose_high()) {
2593 3034            gclog_or_tty->print_cr("[global] "PTR_FORMAT" is not considered "
2594 3035                                   "marked", (void*) obj);
2595 3036          }
2596 3037  
2597 3038          // we need to mark it first
2598 3039          if (_nextMarkBitMap->parMark(objAddr)) {
     3040 +          // Update the task specific count data for obj
     3041 +          add_to_count_data_for(obj, hr, 0 /* worker_i */);
     3042 +
2599 3043            // No OrderAccess:store_load() is needed. It is implicit in the
2600 3044            // CAS done in parMark(objAddr) above
2601 3045            HeapWord* finger = _finger;
2602 3046            if (objAddr < finger) {
2603 3047              if (verbose_high()) {
2604 3048                gclog_or_tty->print_cr("[global] below the global finger "
2605 3049                                       "("PTR_FORMAT"), pushing it", finger);
2606 3050              }
2607 3051              if (!mark_stack_push(obj)) {
2608 3052                if (verbose_low()) {
↓ open down ↓ 226 lines elided ↑ open up ↑
2835 3279    _finger = _heap_start;
2836 3280  
2837 3281    for (int i = 0; i < (int)_max_task_num; ++i) {
2838 3282      OopTaskQueue* queue = _task_queues->queue(i);
2839 3283      queue->set_empty();
2840 3284      // Clear any partial regions from the CMTasks
2841 3285      _tasks[i]->clear_aborted_region();
2842 3286    }
2843 3287  }
2844 3288  
     3289 +// Clear the per-worker arrays used to store the per-region counting data
     3290 +void ConcurrentMark::clear_all_count_data() {
     3291 +  assert(SafepointSynchronize::is_at_safepoint() ||
     3292 +         !Universe::is_fully_initialized(), "must be");
     3293 +
     3294 +  int max_regions = _g1h->max_regions();
     3295 +  
     3296 +  assert(_max_task_num != 0, "unitialized");
     3297 +  assert(_count_card_bitmaps != NULL, "uninitialized");
     3298 +  assert(_count_marked_bytes != NULL, "uninitialized");
     3299 +
     3300 +  for (int i = 0; i < _max_task_num; i += 1) {
     3301 +    BitMap& task_card_bm = count_card_bitmap_for(i);
     3302 +    size_t* marked_bytes_array = count_marked_bytes_for(i);
     3303 +
     3304 +    assert(task_card_bm.size() == _card_bm.size(), "size mismatch");
     3305 +    assert(marked_bytes_array != NULL, "uninitialized");
     3306 +
     3307 +    for (int j = 0; j < max_regions; j++) {
     3308 +      marked_bytes_array[j] = 0;
     3309 +    }
     3310 +    task_card_bm.clear();
     3311 +  }
     3312 +}
     3313 +
     3314 +// Adds the given region to the counting data structures
     3315 +// for the given task id.
     3316 +void ConcurrentMark::add_to_count_data_for(MemRegion mr,
     3317 +                                           HeapRegion* hr,
     3318 +                                           int worker_i) {
     3319 +  G1CollectedHeap* g1h = _g1h;
     3320 +  HeapWord* start = mr.start();
     3321 +  HeapWord* last = mr.last();
     3322 +  size_t index = hr->hrs_index();
     3323 +
     3324 +  assert(!hr->continuesHumongous(), "should not be HC region");
     3325 +  assert(hr == g1h->heap_region_containing(start), "sanity");
     3326 +  assert(hr == g1h->heap_region_containing(mr.last()), "sanity");
     3327 +  assert(0 <= worker_i && worker_i < _max_task_num, "oob");
     3328 +
     3329 +  BitMap& task_card_bm = count_card_bitmap_for(worker_i);
     3330 +  size_t* marked_bytes_array = count_marked_bytes_for(worker_i);
     3331 +
     3332 +  // Below, the term "card num" means the result of shifting an address
     3333 +  // by the card shift -- address 0 corresponds to card number 0.  One
     3334 +  // must subtract the card num of the bottom of the heap to obtain a
     3335 +  // card table index.
     3336 +
     3337 +  intptr_t start_card_num = 
     3338 +    intptr_t(uintptr_t(start) >> CardTableModRefBS::card_shift);
     3339 +  intptr_t last_card_num =
     3340 +    intptr_t(uintptr_t(last) >> CardTableModRefBS::card_shift);
     3341 +
     3342 +  intptr_t bottom_card_num = 
     3343 +    intptr_t(uintptr_t(G1CollectedHeap::heap()->reserved_region().start()) >> 
     3344 +        CardTableModRefBS::card_shift);
     3345 +
     3346 +  BitMap::idx_t start_idx = start_card_num - bottom_card_num;
     3347 +  BitMap::idx_t last_idx = last_card_num - bottom_card_num;
     3348 +  
     3349 +  // The card bitmap is task/worker specific => no need to use 'par' routines.
     3350 +  // Inclusive bit range [start_idx, last_idx]. set_range is exclusive
     3351 +  // so we have to also explicitly set the bit for last_idx.
     3352 +  // Passing last_idx+1 to set_range would work in most cases
     3353 +  // but could trip an OOB assertion.
     3354 +
     3355 +  if ((last_idx - start_idx) > 0) {
     3356 +    task_card_bm.set_range(start_idx, last_idx);
     3357 +  }
     3358 +  task_card_bm.set_bit(last_idx);
     3359 +  
     3360 +  // Add to the task local marked bytes for this region.
     3361 +  marked_bytes_array[index] += mr.byte_size();
     3362 +}
     3363 +
     3364 +void ConcurrentMark::add_to_count_data_for(oop obj, HeapRegion* hr, int worker_i) {
     3365 +  MemRegion mr((HeapWord*)obj, obj->size());
     3366 +  add_to_count_data_for(mr, hr, worker_i);
     3367 +}
     3368 +
     3369 +void ConcurrentMark::add_to_count_data_for(MemRegion mr, int worker_i) {
     3370 +  HeapRegion* hr = _g1h->heap_region_containing(mr.start());
     3371 +  add_to_count_data_for(mr, hr, worker_i);
     3372 +}
     3373 +
     3374 +void ConcurrentMark::add_to_count_data_for(oop obj, int worker_i) {
     3375 +  MemRegion mr((HeapWord*)obj, obj->size());
     3376 +  add_to_count_data_for(mr, worker_i);
     3377 +}
     3378 +
     3379 +// Updates the counting data with liveness info recorded for a
     3380 +// region (typically a GCLab).
     3381 +void ConcurrentMark::add_to_count_data_for_region(MemRegion lab_mr,
     3382 +                                                  BitMap* lab_card_bm,
     3383 +                                                  intptr_t lab_bottom_card_num,
     3384 +                                                  size_t lab_marked_bytes,
     3385 +                                                  int worker_i) {
     3386 +  HeapRegion* hr = _g1h->heap_region_containing(lab_mr.start());
     3387 +
     3388 +  BitMap& task_card_bm = count_card_bitmap_for(worker_i);
     3389 +  size_t* marked_bytes_array = count_marked_bytes_for(worker_i);
     3390 +
     3391 +  // Below, the term "card num" means the result of shifting an address
     3392 +  // by the card shift -- address 0 corresponds to card number 0.  One
     3393 +  // must subtract the card num of the bottom of the heap to obtain a
     3394 +  // card table index.
     3395 +  
     3396 +  intptr_t heap_bottom_card_num = 
     3397 +    intptr_t(uintptr_t(G1CollectedHeap::heap()->reserved_region().start()) >> 
     3398 +        CardTableModRefBS::card_shift);
     3399 +
     3400 +  assert(intptr_t(uintptr_t(lab_mr.start()) >> CardTableModRefBS::card_shift) == lab_bottom_card_num,
     3401 +         "sanity");
     3402 +
     3403 +  // We have to map the indices of set bits in lab_card_bm, using
     3404 +  // lab_bottom_card_num, to indices the card bitmap for the given task.
     3405 +
     3406 +  BitMap::idx_t end_idx = lab_card_bm->size();
     3407 +  BitMap::idx_t start_idx = lab_card_bm->get_next_one_offset(0, end_idx);
     3408 +  while (start_idx < end_idx) {
     3409 +    assert(lab_card_bm->at(start_idx), "should be set");
     3410 +
     3411 +    intptr_t lab_card_num = lab_bottom_card_num + start_idx;
     3412 +    BitMap::idx_t card_bm_idx = lab_card_num - heap_bottom_card_num;
     3413 +
     3414 +    task_card_bm.set_bit(card_bm_idx);
     3415 +
     3416 +    // Get the offset of the next set bit
     3417 +    start_idx = lab_card_bm->get_next_one_offset(start_idx+1, end_idx);
     3418 +  }
     3419 +
     3420 +  // Now add to the marked bytes
     3421 +  marked_bytes_array[hr->hrs_index()] += lab_marked_bytes;
     3422 +}
     3423 +
     3424 +void ConcurrentMark::clear_count_data_for_heap_region(HeapRegion* hr) {
     3425 +  // Clears the count data for the given region from _all_ of
     3426 +  // the per-task counting data structures.
     3427 +
     3428 +  MemRegion used_region = hr->used_region();
     3429 +  HeapWord* start = used_region.start();
     3430 +  HeapWord* last = used_region.last();
     3431 +  size_t hr_index = hr->hrs_index();
     3432 +
     3433 +  intptr_t bottom_card_num =
     3434 +      intptr_t(uintptr_t(G1CollectedHeap::heap()->reserved_region().start()) >>
     3435 +               CardTableModRefBS::card_shift);
     3436 +  
     3437 +  intptr_t start_card_num =
     3438 +    intptr_t(uintptr_t(start) >> CardTableModRefBS::card_shift);
     3439 +  intptr_t last_card_num =
     3440 +    intptr_t(uintptr_t(last) >> CardTableModRefBS::card_shift);
     3441 +  
     3442 +  BitMap::idx_t start_idx = start_card_num - bottom_card_num;
     3443 +  BitMap::idx_t last_idx = last_card_num - bottom_card_num;
     3444 +
     3445 +  size_t used_region_bytes = used_region.byte_size();
     3446 +  size_t marked_bytes = 0;
     3447 +
     3448 +  for (int i=0; i < _max_task_num; i += 1) {
     3449 +    BitMap& task_card_bm = count_card_bitmap_for(i);
     3450 +    size_t* marked_bytes_array = count_marked_bytes_for(i);
     3451 +
     3452 +    marked_bytes += marked_bytes_array[hr_index];
     3453 +    // clear the amount of marked bytes in the task array for this
     3454 +    // region
     3455 +    marked_bytes_array[hr_index] = 0;
     3456 +    
     3457 +    // Clear the inclusive range [start_idx, last_idx] from the
     3458 +    // card bitmap. The clear_range routine is exclusive so we
     3459 +    // need to also explicitly clear the bit at last_idx.
     3460 +    // Passing last_idx+1 to the clear_range would work in
     3461 +    // most cases but could trip an OOB assertion.
     3462 +
     3463 +    if ((last_idx - start_idx) > 0) {
     3464 +      task_card_bm.clear_range(start_idx, last_idx);
     3465 +    }
     3466 +    task_card_bm.clear_bit(last_idx);
     3467 +  }
     3468 +  // We could assert here that marked_bytes == used_region_bytes
     3469 +}
     3470 +
2845 3471  void ConcurrentMark::print_stats() {
2846 3472    if (verbose_stats()) {
2847 3473      gclog_or_tty->print_cr("---------------------------------------------------------------------");
2848 3474      for (size_t i = 0; i < _active_tasks; ++i) {
2849 3475        _tasks[i]->print_stats();
2850 3476        gclog_or_tty->print_cr("---------------------------------------------------------------------");
2851 3477      }
2852 3478    }
2853 3479  }
2854 3480  
↓ open down ↓ 85 lines elided ↑ open up ↑
2940 3566        // that it's marked.  So follow the forwarding pointer.  Note that
2941 3567        // this does the right thing for self-forwarding pointers in the
2942 3568        // evacuation failure case.
2943 3569        obj = obj->forwardee();
2944 3570      }
2945 3571      HeapRegion* hr = _g1h->heap_region_containing(obj);
2946 3572      if (hr != NULL) {
2947 3573        if (hr->in_collection_set()) {
2948 3574          if (_g1h->is_obj_ill(obj)) {
2949 3575            _bm->mark((HeapWord*)obj);
     3576 +          // Update the task specific count data for object
     3577 +          _cm->add_to_count_data_for(obj, hr, 0 /* worker_i */);
     3578 +
2950 3579            if (!push(obj)) {
2951 3580              gclog_or_tty->print_cr("Setting abort in CSMarkOopClosure because push failed.");
2952 3581              set_abort();
2953 3582            }
2954 3583          }
2955 3584        } else {
2956 3585          // Outside the collection set; we need to gray it
2957 3586          _cm->deal_with_reference(obj);
2958 3587        }
2959 3588      }
↓ open down ↓ 61 lines elided ↑ open up ↑
3021 3650          }
3022 3651        }
3023 3652      }
3024 3653      return false;
3025 3654    }
3026 3655  
3027 3656    bool completed() { return _completed; }
3028 3657  };
3029 3658  
3030 3659  class ClearMarksInHRClosure: public HeapRegionClosure {
     3660 +  ConcurrentMark* _cm;
3031 3661    CMBitMap* _bm;
3032 3662  public:
3033      -  ClearMarksInHRClosure(CMBitMap* bm): _bm(bm) { }
     3663 +  ClearMarksInHRClosure(ConcurrentMark* cm, CMBitMap* bm):
     3664 +    _cm(cm), _bm(bm)
     3665 +  { }
3034 3666  
3035 3667    bool doHeapRegion(HeapRegion* r) {
3036 3668      if (!r->used_region().is_empty() && !r->evacuation_failed()) {
3037      -      MemRegion usedMR = r->used_region();
3038 3669        _bm->clearRange(r->used_region());
     3670 +      // Need to remove values from the count info
     3671 +      _cm->clear_count_data_for_heap_region(r);
3039 3672      }
3040 3673      return false;
3041 3674    }
3042 3675  };
3043 3676  
3044 3677  void ConcurrentMark::complete_marking_in_collection_set() {
3045 3678    G1CollectedHeap* g1h =  G1CollectedHeap::heap();
3046 3679  
3047 3680    if (!g1h->mark_in_progress()) {
3048 3681      g1h->g1_policy()->record_mark_closure_time(0.0);
↓ open down ↓ 5 lines elided ↑ open up ↑
3054 3687    while (true) {
3055 3688      i++;
3056 3689      CompleteMarkingInCSHRClosure cmplt(this);
3057 3690      g1h->collection_set_iterate(&cmplt);
3058 3691      if (cmplt.completed()) break;
3059 3692    }
3060 3693    double end_time = os::elapsedTime();
3061 3694    double elapsed_time_ms = (end_time - start) * 1000.0;
3062 3695    g1h->g1_policy()->record_mark_closure_time(elapsed_time_ms);
3063 3696  
3064      -  ClearMarksInHRClosure clr(nextMarkBitMap());
     3697 +  ClearMarksInHRClosure clr(this, nextMarkBitMap());
3065 3698    g1h->collection_set_iterate(&clr);
3066 3699  }
3067 3700  
3068 3701  // The next two methods deal with the following optimisation. Some
3069 3702  // objects are gray by being marked and located above the finger. If
3070 3703  // they are copied, during an evacuation pause, below the finger then
3071 3704  // they need to be pushed on the stack. The observation is that, if
3072 3705  // there are no regions in the collection set located above the
3073 3706  // finger, then the above cannot happen, hence we do not need to
3074 3707  // explicitly gray any objects when copying them to below the
↓ open down ↓ 121 lines elided ↑ open up ↑
3196 3829      gclog_or_tty->print_cr("    RS scrub total time = %8.2f s (avg = %8.2f ms).",
3197 3830                             _total_rs_scrub_time,
3198 3831                             (_cleanup_times.num() > 0 ? _total_rs_scrub_time * 1000.0 /
3199 3832                              (double)_cleanup_times.num()
3200 3833                             : 0.0));
3201 3834    }
3202 3835    gclog_or_tty->print_cr("  Total stop_world time = %8.2f s.",
3203 3836                           (_init_times.sum() + _remark_times.sum() +
3204 3837                            _cleanup_times.sum())/1000.0);
3205 3838    gclog_or_tty->print_cr("  Total concurrent time = %8.2f s "
3206      -                "(%8.2f s marking, %8.2f s counting).",
     3839 +                "(%8.2f s marking).",
3207 3840                  cmThread()->vtime_accum(),
3208      -                cmThread()->vtime_mark_accum(),
3209      -                cmThread()->vtime_count_accum());
     3841 +                cmThread()->vtime_mark_accum());
3210 3842  }
3211 3843  
3212 3844  void ConcurrentMark::print_worker_threads_on(outputStream* st) const {
3213 3845    _parallel_workers->print_worker_threads_on(st);
3214 3846  }
3215 3847  
3216 3848  // Closures
3217 3849  // XXX: there seems to be a lot of code duplication here;
3218 3850  // should refactor and consolidate the shared code.
3219 3851  
↓ open down ↓ 1483 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX