Unified diff of the excerpt ('-' lines are the old version, '+' lines the new):

--- old
+++ new

 #include "precompiled.hpp"
 #include "gc/shared/cardTableModRefBS.hpp"
 #include "gc/shared/cardTableRS.hpp"
 #include "gc/shared/collectedHeap.hpp"
 #include "gc/shared/genCollectedHeap.hpp"
 #include "gc/shared/space.inline.hpp"
 #include "memory/allocation.inline.hpp"
 #include "memory/virtualspace.hpp"
 #include "oops/oop.inline.hpp"
 #include "runtime/java.hpp"
 #include "runtime/mutexLocker.hpp"
 #include "runtime/orderAccess.inline.hpp"
 #include "runtime/vmThread.hpp"
 
 void CardTableModRefBS::non_clean_card_iterate_parallel_work(Space* sp, MemRegion mr,
                                                              OopsInGenClosure* cl,
                                                              CardTableRS* ct,
                                                              uint n_threads) {
   assert(n_threads > 0, "Error: expected n_threads > 0");
-  assert(n_threads <= (uint)ParallelGCThreads, "# worker threads != # requested!");
+  assert(n_threads <= (uint)ParallelGCThreads,
+         err_msg("Error: n_threads: %u > ParallelGCThreads: %u", n_threads, (uint)ParallelGCThreads));
 
   // Make sure the LNC array is valid for the space.
   jbyte** lowest_non_clean;
   uintptr_t lowest_non_clean_base_chunk_index;
   size_t lowest_non_clean_chunk_size;
   get_LNC_array_for_space(sp, lowest_non_clean,
                           lowest_non_clean_base_chunk_index,
                           lowest_non_clean_chunk_size);
 
   uint n_strides = n_threads * ParGCStridesPerThread;
   SequentialSubTasksDone* pst = sp->par_seq_tasks();
   // Sets the condition for completion of the subtask (how many threads
   // need to finish in order to be done).
   pst->set_n_threads(n_threads);
   pst->set_n_tasks(n_strides);
 
-  bool parallel = n_threads > 0;
-
   uint stride = 0;
   while (!pst->is_task_claimed(/* reference */ stride)) {
     process_stride(sp, mr, stride, n_strides,
-                   parallel,
                    cl, ct,
                    lowest_non_clean,
                    lowest_non_clean_base_chunk_index,
                    lowest_non_clean_chunk_size);
   }
   if (pst->all_tasks_completed()) {
     // Clear lowest_non_clean array for next time.
     intptr_t first_chunk_index = addr_to_chunk_index(mr.start());
     uintptr_t last_chunk_index = addr_to_chunk_index(mr.last());
     for (uintptr_t ch = first_chunk_index; ch <= last_chunk_index; ch++) {
       intptr_t ind = ch - lowest_non_clean_base_chunk_index;
       assert(0 <= ind && ind < (intptr_t)lowest_non_clean_chunk_size,
              "Bounds error");
       lowest_non_clean[ind] = NULL;
     }
   }
 }
 
 void
 CardTableModRefBS::
 process_stride(Space* sp,
                MemRegion used,
                jint stride, int n_strides,
-               bool parallel,
                OopsInGenClosure* cl,
                CardTableRS* ct,
                jbyte** lowest_non_clean,
                uintptr_t lowest_non_clean_base_chunk_index,
                size_t lowest_non_clean_chunk_size) {
   // We go from higher to lower addresses here; it wouldn't help that much
   // because of the strided parallelism pattern used here.
 
   // Find the first card address of the first chunk in the stride that is
   // at least "bottom" of the used region.
   jbyte* start_card = byte_for(used.start());
   jbyte* end_card = byte_after(used.last());
   uintptr_t start_chunk = addr_to_chunk_index(used.start());
   uintptr_t start_chunk_stride_num = start_chunk % n_strides;
   jbyte* chunk_card_start;
 
   if ((uintptr_t)stride >= start_chunk_stride_num) {
     chunk_card_start = (jbyte*)(start_card +
                                 (stride - start_chunk_stride_num) *
                                 ParGCCardsPerStrideChunk);
   } else {
     chunk_card_start = (jbyte*)(start_card +
                                 (n_strides - start_chunk_stride_num + stride) *
                                 ParGCCardsPerStrideChunk);
   }
 
   while (chunk_card_start < end_card) {
     // Even though we go from lower to higher addresses below, the
     // strided parallelism can interleave the actual processing of the
     // dirty pages in various ways. For a specific chunk within this
     // stride, we take care to avoid double scanning or missing a card
     // by suitably initializing the "min_done" field in process_chunk_boundaries()
     // below, together with the dirty region extension accomplished in
     // DirtyCardToOopClosure::do_MemRegion().
     jbyte* chunk_card_end = chunk_card_start + ParGCCardsPerStrideChunk;
     // Invariant: chunk_mr should be fully contained within the "used" region.
     MemRegion chunk_mr = MemRegion(addr_for(chunk_card_start),
                                    chunk_card_end >= end_card ?
                                      used.end() : addr_for(chunk_card_end));
     assert(chunk_mr.word_size() > 0, "[chunk_card_start > used_end)");
     assert(used.contains(chunk_mr), "chunk_mr should be subset of used");
 
+    // This function is used by the parallel card table iteration.
+    const bool parallel = true;
+
     DirtyCardToOopClosure* dcto_cl = sp->new_dcto_cl(cl, precision(),
                                                      cl->gen_boundary(),
                                                      parallel);
     ClearNoncleanCardWrapper clear_cl(dcto_cl, ct, parallel);
 
 
     // Process the chunk.
     process_chunk_boundaries(sp,
                              dcto_cl,
                              chunk_mr,
                              used,
                              lowest_non_clean,
                              lowest_non_clean_base_chunk_index,
                              lowest_non_clean_chunk_size);
 
     // We want the LNC array updates above in process_chunk_boundaries
     // to be visible before any of the card table value changes as a
     // result of the dirty card iteration below.
     OrderAccess::storestore();
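
A note on the claiming loop in non_clean_card_iterate_parallel_work(): pst->is_task_claimed(stride) hands each caller a fresh stride index until all n_strides tasks are claimed, which is why the same loop body works for one worker or many. A minimal sketch of that pattern in standard C++ follows; SeqTasks and worker are hypothetical stand-ins, not the HotSpot SequentialSubTasksDone implementation.

// Hypothetical, simplified stand-in for SequentialSubTasksDone: an atomic
// counter hands out stride indices, so each stride is claimed exactly once
// regardless of how worker threads interleave.
#include <atomic>
#include <cstdio>

struct SeqTasks {
  std::atomic<unsigned> _claimed{0};
  unsigned _n_tasks;

  explicit SeqTasks(unsigned n_tasks) : _n_tasks(n_tasks) {}

  // Mirrors the HotSpot convention: writes the claimed task index through
  // 't' and returns true once every task has already been handed out.
  bool is_task_claimed(unsigned& t) {
    t = _claimed.fetch_add(1, std::memory_order_relaxed);
    return t >= _n_tasks;
  }
};

void worker(SeqTasks* pst) {
  unsigned stride = 0;
  while (!pst->is_task_claimed(stride)) {
    // process_stride(...) would run here; each claimed stride is unique.
    std::printf("claimed stride %u\n", stride);
  }
}

int main() {
  SeqTasks pst(8);   // e.g. n_threads * ParGCStridesPerThread = 8
  worker(&pst);      // single-threaded demo; the claiming is thread-safe
  return 0;
}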
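The chunk_card_start arithmetic in process_stride() is easier to see with concrete numbers: chunk k of the card table belongs to stride k % n_strides, and because chunk indices are global to the card table, the first chunk of the used region can map to any stride number. The sketch below replays the same if/else with hypothetical values for ParGCCardsPerStrideChunk, n_strides, and the used-region bounds; the inner loop assumes each stride then advances by n_strides chunks at a time (the bottom of the while loop falls outside this excerpt).

// Hypothetical numbers illustrating the chunk-to-stride mapping in
// process_stride(). Plain integers stand in for card addresses; in HotSpot
// the chunk index comes from addr_to_chunk_index().
#include <cstdint>
#include <cstdio>

int main() {
  const uintptr_t cards_per_chunk = 256;   // stand-in for ParGCCardsPerStrideChunk
  const uintptr_t n_strides       = 4;     // n_threads * ParGCStridesPerThread
  const uintptr_t start_card      = 1280;  // first card of "used" (chunk-aligned here)
  const uintptr_t end_card        = 5376;  // one past the last card of "used"

  // Chunk indices are global, so the region's first chunk can land anywhere
  // in the stride rotation.
  const uintptr_t start_chunk            = start_card / cards_per_chunk;  // = 5
  const uintptr_t start_chunk_stride_num = start_chunk % n_strides;       // = 1

  for (uintptr_t stride = 0; stride < n_strides; stride++) {
    // Same branch as process_stride(): distance from the region's first
    // chunk to the first chunk owned by this stride.
    uintptr_t chunk_card_start;
    if (stride >= start_chunk_stride_num) {
      chunk_card_start = start_card + (stride - start_chunk_stride_num) * cards_per_chunk;
    } else {
      chunk_card_start = start_card + (n_strides - start_chunk_stride_num + stride) * cards_per_chunk;
    }
    std::printf("stride %u owns card ranges:", (unsigned)stride);
    for (uintptr_t c = chunk_card_start; c < end_card; c += n_strides * cards_per_chunk) {
      std::printf(" [%u,%u)", (unsigned)c, (unsigned)(c + cards_per_chunk));
    }
    std::printf("\n");
  }
  return 0;
}

Running this shows the strides interleaving over the region without overlap or gaps, which is exactly the invariant the min_done initialization and dirty-region extension mentioned in the comments are protecting at chunk boundaries.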
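The storestore() at the end of the excerpt is load-bearing: process_chunk_boundaries() publishes entries in the shared lowest_non_clean ("LNC") array, and those stores must become visible to other GC workers no later than this worker's card-table writes, or a peer could act on a changed card while still reading a stale LNC entry. A rough analogy in standard C++ is sketched below; all names are hypothetical stand-ins, and a release fence is strictly stronger than a storestore barrier but conveys the ordering that matters here.

// Rough analogy for the OrderAccess::storestore() above, using standard
// C++ atomics. lnc and cards are hypothetical stand-ins for the LNC array
// updates in process_chunk_boundaries() and the card-table writes of the
// dirty-card scan.
#include <atomic>
#include <cstddef>

std::atomic<void*>         lnc[128];    // stand-in for lowest_non_clean[]
std::atomic<unsigned char> cards[4096]; // stand-in for the card table

void process_one_chunk(std::size_t chunk, std::size_t card, void* boundary) {
  // First wave of stores: publish chunk-boundary info for other workers.
  lnc[chunk].store(boundary, std::memory_order_relaxed);

  // storestore: stores above may not be reordered past stores below, so a
  // worker that observes the card change also observes the LNC entry
  // (given a matching acquire/loadload on the reading side).
  std::atomic_thread_fence(std::memory_order_release);

  // Second wave of stores: card-table updates from scanning the chunk.
  cards[card].store(0 /* clean */, std::memory_order_relaxed);
}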