/*
 * Copyright (c) 2016, 2019, Red Hat, Inc. All rights reserved.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef SHARE_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP
#define SHARE_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP
#include "gc/shared/owstTaskTerminator.hpp"
#include "gc/shared/taskqueue.hpp"
#include "memory/allocation.hpp"
#include "runtime/atomic.hpp"
#include "runtime/mutex.hpp"
#include "runtime/thread.hpp"

// An OverflowTaskQueue with a single-element buffer in front of the
// underlying queue. The buffer slot is _elem, and _buf_empty tracks whether
// it currently holds a task; the queue is only empty when both the buffer
// and the underlying OverflowTaskQueue are empty. push/pop/clear are
// defined out-of-line (in the companion .inline.hpp), so the exact
// buffering policy is not visible here.
template<class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE>
class BufferedOverflowTaskQueue: public OverflowTaskQueue<E, F, N>
{
public:
  typedef OverflowTaskQueue<E, F, N> taskqueue_t;

  // Start with an empty buffer slot.
  BufferedOverflowTaskQueue() : _buf_empty(true) {};

  TASKQUEUE_STATS_ONLY(using taskqueue_t::stats;)

  // Push task t into the queue. Returns true on success.
  inline bool push(E t);

  // Attempt to pop from the queue. Returns true on success.
  inline bool pop(E &t);

  // Drop all tasks (buffer and underlying queue).
  inline void clear();

  // Empty only when both the one-element buffer and the backing queue are empty.
  inline bool is_empty() const {
    return _buf_empty && taskqueue_t::is_empty();
  }

private:
  bool _buf_empty; // true when _elem does not hold a buffered task
  E _elem;         // single buffered task, valid only when !_buf_empty
};

// ObjArrayChunkedTask
//
// Encodes both regular oops, and the array oops plus chunking data for parallel array processing.
// The design goal is to make the regular oop ops very fast, because that would be the prevailing
// case. On the other hand, it should not block parallel array processing from efficiently dividing
// the array work.
//
// The idea is to steal the bits from the 64-bit oop to encode array data, if needed. For the
// proper divide-and-conquer strategies, we want to encode the "blocking" data. It turns out, the
// most efficient way to do this is to encode the array block as (chunk * 2^pow), where it is assumed
// that the block has the size of 2^pow. This requires for pow to have only 5 bits (2^32) to encode
// all possible arrays.
//
//    |---------oop---------|-pow-|--chunk---|
//    0                    49     54        64
//
// By definition, chunk == 0 means "no chunk", i.e. chunking starts from 1.
//
// This encoding gives a few interesting benefits:
//
// a) Encoding/decoding regular oops is very simple, because the upper bits are zero in that task:
//
//    |---------oop---------|00000|0000000000| // no chunk data
//
// This helps the most ubiquitous path. The initialization amounts to putting the oop into the word
// with zero padding. Testing for "chunkedness" is testing for zero with chunk mask.
//
// b) Splitting tasks for divide-and-conquer is possible. Suppose we have chunk <C, P> that covers
// interval [ (C-1)*2^P; C*2^P ).
We can then split it into two chunks: 89 // <2*C - 1, P-1>, that covers interval [ (2*C - 2)*2^(P-1); (2*C - 1)*2^(P-1) ) 90 // <2*C, P-1>, that covers interval [ (2*C - 1)*2^(P-1); 2*C*2^(P-1) ) 91 // 92 // Observe that the union of these two intervals is: 93 // [ (2*C - 2)*2^(P-1); 2*C*2^(P-1) ) 94 // 95 // ...which is the original interval: 96 // [ (C-1)*2^P; C*2^P ) 97 // 98 // c) The divide-and-conquer strategy could even start with chunk <1, round-log2-len(arr)>, and split 99 // down in the parallel threads, which alleviates the upfront (serial) splitting costs. 100 // 101 // Encoding limitations caused by current bitscales mean: 102 // 10 bits for chunk: max 1024 blocks per array 103 // 5 bits for power: max 2^32 array 104 // 49 bits for oop: max 512 TB of addressable space 105 // 106 // Stealing bits from oop trims down the addressable space. Stealing too few bits for chunk ID limits 107 // potential parallelism. Stealing too few bits for pow limits the maximum array size that can be handled. 108 // In future, these might be rebalanced to favor one degree of freedom against another. For example, 109 // if/when Arrays 2.0 bring 2^64-sized arrays, we might need to steal another bit for power. We could regain 110 // some bits back if chunks are counted in ObjArrayMarkingStride units. 111 // 112 // There is also a fallback version that uses plain fields, when we don't have enough space to steal the 113 // bits from the native pointer. It is useful to debug the optimized version. 
//

#ifdef _MSC_VER
#pragma warning(push)
// warning C4522: multiple assignment operators specified
#pragma warning( disable:4522 )
#endif

// The optimized, bit-stealing task representation needs a 64-bit word.
#ifdef _LP64
#define SHENANDOAH_OPTIMIZED_OBJTASK 1
#else
#define SHENANDOAH_OPTIMIZED_OBJTASK 0
#endif

#if SHENANDOAH_OPTIMIZED_OBJTASK
// Optimized version: packs <oop, pow, chunk> into a single uintptr_t,
// using the layout described in the design comment above.
class ObjArrayChunkedTask
{
public:
  // Field widths, in bits. The oop field gets whatever remains of the word
  // after chunk and pow are carved out (49 bits on 64-bit platforms).
  enum {
    chunk_bits = 10,
    pow_bits = 5,
    oop_bits = sizeof(uintptr_t)*8 - chunk_bits - pow_bits
  };
  // Field positions: oop in the low bits, then pow, then chunk at the top.
  enum {
    oop_shift = 0,
    pow_shift = oop_shift + oop_bits,
    chunk_shift = pow_shift + pow_bits
  };

public:
  // Regular (non-chunked) task: chunk and pow fields stay zero, so this is
  // just the oop with zero padding — the fast, ubiquitous path.
  ObjArrayChunkedTask(oop o = NULL) {
    assert(decode_oop(encode_oop(o)) == o, "oop can be encoded: " PTR_FORMAT, p2i(o));
    _obj = encode_oop(o);
  }
  // Chunked array task. Each assert verifies the value round-trips through
  // its bit field, i.e. it actually fits in the stolen bits.
  ObjArrayChunkedTask(oop o, int chunk, int pow) {
    assert(decode_oop(encode_oop(o)) == o, "oop can be encoded: " PTR_FORMAT, p2i(o));
    assert(decode_chunk(encode_chunk(chunk)) == chunk, "chunk can be encoded: %d", chunk);
    assert(decode_pow(encode_pow(pow)) == pow, "pow can be encoded: %d", pow);
    _obj = encode_oop(o) | encode_chunk(chunk) | encode_pow(pow);
  }
  ObjArrayChunkedTask(const ObjArrayChunkedTask& t): _obj(t._obj) { }

  ObjArrayChunkedTask& operator =(const ObjArrayChunkedTask& t) {
    _obj = t._obj;
    return *this;
  }
  // Volatile flavor: task queue slots are accessed through volatile
  // references (see taskqueue.hpp), which requires this extra operator
  // (hence the MSVC C4522 suppression above). The const_cast strips the
  // volatile qualifier from the assignment's result so it is not read back
  // through a volatile lvalue.
  volatile ObjArrayChunkedTask&
  operator =(const volatile ObjArrayChunkedTask& t) volatile {
    (void)const_cast<uintptr_t&>(_obj = t._obj);
    return *this;
  }

  // Extract the oop field (low oop_bits bits) and reinterpret as a pointer.
  inline oop decode_oop(uintptr_t val) const {
    return (oop) reinterpret_cast<void*>((val >> oop_shift) & right_n_bits(oop_bits));
  }

  inline int decode_chunk(uintptr_t val) const {
    return (int) ((val >> chunk_shift) & right_n_bits(chunk_bits));
  }

  inline int decode_pow(uintptr_t val) const {
    return (int) ((val >> pow_shift) & right_n_bits(pow_bits));
  }

  inline uintptr_t encode_oop(oop obj) const {
    return ((uintptr_t)(void*) obj) << oop_shift;
  }

  inline uintptr_t encode_chunk(int chunk) const {
    return ((uintptr_t) chunk) << chunk_shift;
  }

  inline uintptr_t encode_pow(int pow) const {
    return ((uintptr_t) pow) << pow_shift;
  }

  inline oop obj()   const { return decode_oop(_obj); }
  inline int chunk() const { return decode_chunk(_obj); }
  inline int pow()   const { return decode_pow(_obj); }
  // Masks away the oop and pow fields, keeping only the chunk field:
  // by definition chunk == 0 means "not chunked".
  inline bool is_not_chunked() const { return (_obj & ~right_n_bits(oop_bits + pow_bits)) == 0; }

  DEBUG_ONLY(bool is_valid() const); // Tasks to be pushed/popped must be valid.

  // Upper bound on encodable addresses: 2^oop_bits (512 TB with 49 oop bits).
  static uintptr_t max_addressable() {
    return nth_bit(oop_bits);
  }

  // Maximum number of chunks per array: 2^chunk_bits.
  static int chunk_size() {
    return nth_bit(chunk_bits);
  }

private:
  uintptr_t _obj; // packed <oop | pow | chunk> word
};
#else
// Fallback version with plain fields, for platforms without enough pointer
// bits to steal from. Also useful to debug the optimized version.
class ObjArrayChunkedTask
{
public:
  // Same field widths as the optimized version, enforced only by asserts here.
  enum {
    chunk_bits = 10,
    pow_bits = 5,
  };
public:
  ObjArrayChunkedTask(oop o = NULL, int chunk = 0, int pow = 0): _obj(o) {
    assert(0 <= chunk && chunk < nth_bit(chunk_bits), "chunk is sane: %d", chunk);
    assert(0 <= pow && pow < nth_bit(pow_bits), "pow is sane: %d", pow);
    _chunk = chunk;
    _pow = pow;
  }
  ObjArrayChunkedTask(const ObjArrayChunkedTask& t): _obj(t._obj), _chunk(t._chunk), _pow(t._pow) { }

  ObjArrayChunkedTask& operator =(const ObjArrayChunkedTask& t) {
    _obj = t._obj;
    _chunk = t._chunk;
    _pow = t._pow;
    return *this;
  }
  // Volatile flavor, mirroring the optimized version; see the comment there.
  volatile ObjArrayChunkedTask&
  operator =(const volatile ObjArrayChunkedTask& t) volatile {
    (void)const_cast<oop&>(_obj = t._obj);
    _chunk = t._chunk;
    _pow = t._pow;
    return *this;
  }

  inline oop obj()   const { return _obj; }
  inline int chunk() const { return _chunk; }
  inline int pow()   const { return _pow; }

  // chunk == 0 means "not chunked", consistent with the optimized encoding.
  inline bool is_not_chunked() const { return _chunk == 0; }

  DEBUG_ONLY(bool is_valid() const); // Tasks to be pushed/popped must be valid.

  // NOTE(review): returns sizeof(oop) (the pointer size), not an address-space
  // bound like the optimized version's 2^oop_bits — presumably only used for
  // reporting where the fallback imposes no real limit; confirm against callers.
  static size_t max_addressable() {
    return sizeof(oop);
  }

  static int chunk_size() {
    return nth_bit(chunk_bits);
  }

private:
  oop _obj;   // the array (or regular) oop
  int _chunk; // chunk index; 0 means "not chunked"
  int _pow;   // log2 of the chunk block size
};
#endif // SHENANDOAH_OPTIMIZED_OBJTASK

#ifdef _MSC_VER
#pragma warning(pop)
#endif

typedef ObjArrayChunkedTask ShenandoahMarkTask;
typedef BufferedOverflowTaskQueue<ShenandoahMarkTask, mtGC> ShenandoahBufferedOverflowTaskQueue;
typedef Padded<ShenandoahBufferedOverflowTaskQueue> ShenandoahObjToScanQueue;

// A task queue set whose queues can be claimed one at a time by parallel
// workers via claim_next(), e.g. to divide per-queue work among threads.
template <class T, MEMFLAGS F>
class ParallelClaimableQueueSet: public GenericTaskQueueSet<T, F> {
private:
  // Pad _claimed_index to its own cache line: it is hammered by all
  // claiming threads, and padding avoids false sharing with neighbors.
  DEFINE_PAD_MINUS_SIZE(0, DEFAULT_CACHE_LINE_SIZE, sizeof(volatile jint));
  volatile jint _claimed_index; // next queue index to hand out
  DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, 0);

  debug_only(uint _reserved; ) // debug-only record of reserve(n), for get_reserved()

public:
  using GenericTaskQueueSet<T, F>::size;

public:
  ParallelClaimableQueueSet(int n) : GenericTaskQueueSet<T, F>(n), _claimed_index(0) {
    debug_only(_reserved = 0; )
  }

  // Restart claiming from the first queue.
  void clear_claimed() { _claimed_index = 0; }
  // Claim and return the next unclaimed queue, or NULL when all are claimed.
  T* claim_next();

  // reserve queues that not for parallel claiming
  void reserve(uint n) {
    assert(n <= size(), "Sanity");
    _claimed_index = (jint)n;
    debug_only(_reserved = n;)
  }

  debug_only(uint get_reserved() const { return (uint)_reserved; })
};

template <class T, MEMFLAGS F>
T* ParallelClaimableQueueSet<T, F>::claim_next() {
  jint size = (jint)GenericTaskQueueSet<T, F>::size();

  // Fast-path check; racy reads are benign since the atomic add below
  // re-validates the claimed index.
  if (_claimed_index >= size) {
    return NULL;
  }

  // Atomic::add returns the post-increment value, so this thread owns
  // slot (index - 1) if index is still within bounds.
  jint index = Atomic::add(&_claimed_index, 1);

  if (index <= size) {
    return GenericTaskQueueSet<T, F>::queue((uint)index - 1);
  } else {
    return NULL;
  }
}

// The concrete queue set used by Shenandoah marking.
class ShenandoahObjToScanQueueSet: public ParallelClaimableQueueSet<ShenandoahObjToScanQueue, mtGC> {
public:
  ShenandoahObjToScanQueueSet(int n) : ParallelClaimableQueueSet<ShenandoahObjToScanQueue, mtGC>(n) {}

  bool is_empty();
  void clear();

#if TASKQUEUE_STATS
  static void print_taskqueue_stats_hdr(outputStream* const st);
  void print_taskqueue_stats() const;
  void reset_taskqueue_stats();
#endif // TASKQUEUE_STATS
};

// Termination hook that aborts the termination protocol when the GC cycle
// has been cancelled on the heap.
class ShenandoahTerminatorTerminator : public TerminatorTerminator {
private:
  ShenandoahHeap* _heap;
public:
  ShenandoahTerminatorTerminator(ShenandoahHeap* const heap) : _heap(heap) { }
  // return true, terminates immediately, even if there's remaining work left
  virtual bool should_exit_termination() { return _heap->cancelled_gc(); }
};

// Stack-allocated wrapper that owns an OWSTTaskTerminator for the duration
// of a parallel phase (constructed/destroyed in the .cpp).
class ShenandoahTaskTerminator : public StackObj {
private:
  OWSTTaskTerminator* const _terminator;
public:
  ShenandoahTaskTerminator(uint n_threads, TaskQueueSetSuper* queue_set);
  ~ShenandoahTaskTerminator();

  // Offer termination with an optional early-exit hook (may be NULL).
  bool offer_termination(ShenandoahTerminatorTerminator* terminator) {
    return _terminator->offer_termination(terminator);
  }

  void reset_for_reuse() { _terminator->reset_for_reuse(); }
  bool offer_termination() { return offer_termination((ShenandoahTerminatorTerminator*)NULL); }
};

#endif // SHARE_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP